diff --git a/marble/configs/test_config_db_single/test_config_db.yaml b/marble/configs/test_config_db_single/test_config_db.yaml new file mode 100644 index 00000000..e538259f --- /dev/null +++ b/marble/configs/test_config_db_single/test_config_db.yaml @@ -0,0 +1,45 @@ +coordinate_mode: star +relationships: [] +llm: "gpt-4o-mini" + +environment: + type: DB + name: "DB Simulation Environment" + max_iterations: 5 + anomalies: + - anomaly: MISSING_INDEXES + threads: 100 + ncolumn: 20 + nrow: 20000 + colsize: 100 + +communication: False + +task: + content: "Analyze the database alerts & outputs and find out the reason that caused it. The alerts might include: NodeMemSwapped, NodeLoadHigh, ... The reasons could be: ['INSERT_LARGE_DATA', 'MISSING_INDEXES','LOCK_CONTENTION','VACUUM','REDUNDANT_INDEX','INSERT_LARGE_DATA,IO_CONTENTION', 'FETCH_LARGE_DATA,CORRELATED_SUBQUERY','POOR_JOIN_PERFORMANCE,CPU_CONTENTION']. Only one of these reasons would apply. The planner should ask different experts to work on same task, and summarize their opinions into a final prediction. They can only do 3 things with tools. First is get alert. The second thing they can do is check whether a metric is abnormal using a statistical method. They can check: cpu_usage, memory_usage, network, and io. The third thing they can do is match diagnostic knowledge based on the expert and the four metrics, to guess what has caused the problem." + output_format: "The alerts might include: NodeMemSwapped, NodeLoadHigh, ... Please choose the most likely cause of the database anomaly from the following list, based on the expert agents: ['INSERT_LARGE_DATA', 'MISSING_INDEXES','LOCK_CONTENTION','VACUUM','REDUNDANT_INDEX','INSERT_LARGE_DATA,IO_CONTENTION', 'FETCH_LARGE_DATA,CORRELATED_SUBQUERY','POOR_JOIN_PERFORMANCE,CPU_CONTENTION']. You can ONLY CHOOSE ONE." + +agents: + - type: BaseAgent + agent_id: ConfigurationExpert + profile: "ConfigurationExpert specializes in system configurations and optimizations." + - type: BaseAgent + agent_id: CpuExpert + profile: "CpuExpert is knowledgeable in CPU architecture, performance, and optimizations." + - type: BaseAgent + agent_id: WorkloadExpert + profile: "WorkloadExpert excels in analyzing workloads, resource allocation, and optimization for efficiency." + +memory: + type: SharedMemory + # Additional memory configurations if needed + +metrics: {} + # Define metrics configurations for the Evaluator + # Example: + # accuracy: true + # response_time: true + +engine_planner: + initial_progress: "Starting the simulation." + # Additional engine planner configurations if needed \ No newline at end of file diff --git a/marble/configs/test_config_db_testset/test_config_db.yaml b/marble/configs/test_config_db_testset/test_config_db.yaml new file mode 100644 index 00000000..d54c380b --- /dev/null +++ b/marble/configs/test_config_db_testset/test_config_db.yaml @@ -0,0 +1,40 @@ +coordinate_mode: star +relationships: [] +llm: "gpt-4o-mini" + +environment: + type: DB + name: "DB Simulation Environment" + max_iterations: 5 + anomalies: [] + +communication: False + +task: + content: "Analyze the database alerts & outputs and find out the reason that caused it. The alerts might include: NodeMemSwapped, NodeLoadHigh, ... The reasons could be: ['INSERT_LARGE_DATA', 'MISSING_INDEXES','LOCK_CONTENTION','VACUUM','REDUNDANT_INDEX','INSERT_LARGE_DATA,IO_CONTENTION', 'FETCH_LARGE_DATA,CORRELATED_SUBQUERY','POOR_JOIN_PERFORMANCE,CPU_CONTENTION']. Only one of these reasons would apply. 
The planner should ask different experts to work on same task, and summarize their opinions into a final prediction. They can only do 3 things with tools. First is get alert. The second thing they can do is check whether a metric is abnormal using a statistical method. They can check: cpu_usage, memory_usage, network, and io. The third thing they can do is match diagnostic knowledge based on the expert and the four metrics, to guess what has caused the problem." + output_format: "The alerts might include: NodeMemSwapped, NodeLoadHigh, ... Please choose the most likely cause of the database anomaly from the following list, based on the expert agents: ['INSERT_LARGE_DATA', 'MISSING_INDEXES','LOCK_CONTENTION','VACUUM','REDUNDANT_INDEX','INSERT_LARGE_DATA,IO_CONTENTION', 'FETCH_LARGE_DATA,CORRELATED_SUBQUERY','POOR_JOIN_PERFORMANCE,CPU_CONTENTION']. You can ONLY CHOOSE ONE." + +agents: + - type: BaseAgent + agent_id: ConfigurationExpert + profile: "ConfigurationExpert specializes in system configurations and optimizations." + - type: BaseAgent + agent_id: CpuExpert + profile: "CpuExpert is knowledgeable in CPU architecture, performance, and optimizations." + - type: BaseAgent + agent_id: WorkloadExpert + profile: "WorkloadExpert excels in analyzing workloads, resource allocation, and optimization for efficiency." + +memory: + type: SharedMemory + # Additional memory configurations if needed + +metrics: {} + # Define metrics configurations for the Evaluator + # Example: + # accuracy: true + # response_time: true + +engine_planner: + initial_progress: "Starting the simulation." + # Additional engine planner configurations if needed \ No newline at end of file diff --git a/marble/engine/engine.py b/marble/engine/engine.py index 815e8d0d..88befa40 100644 --- a/marble/engine/engine.py +++ b/marble/engine/engine.py @@ -9,7 +9,7 @@ from marble.agent import BaseAgent from marble.configs.config import Config from marble.engine.engine_planner import EnginePlanner -from marble.environments import BaseEnvironment, ResearchEnvironment, WebEnvironment +from marble.environments import BaseEnvironment, ResearchEnvironment, WebEnvironment, DBEnvironment from marble.evaluator.evaluator import Evaluator from marble.graph.agent_graph import AgentGraph from marble.memory.base_memory import BaseMemory @@ -80,6 +80,9 @@ def _initialize_environment(self, env_config: Dict[str, Any]) -> BaseEnvironment elif env_type == "Research": env3 = ResearchEnvironment(name="Research Environment", config=env_config) return env3 + elif env_type == "DB": + env3 = DBEnvironment(name="Database Environment", config=env_config) + return env3 else: raise ValueError(f"Unsupported environment type: {env_type}") diff --git a/marble/environments/__init__.py b/marble/environments/__init__.py index 59f8dbf1..bb3d2268 100644 --- a/marble/environments/__init__.py +++ b/marble/environments/__init__.py @@ -1,9 +1,11 @@ from .base_env import BaseEnvironment from .research_env import ResearchEnvironment from .web_env import WebEnvironment +from .db_env import DBEnvironment __all__ = [ 'BaseEnvironment', 'WebEnvironment', - 'ResearchEnvironment' + 'ResearchEnvironment', + 'DBEnvironment' ] diff --git a/marble/environments/db_env.py b/marble/environments/db_env.py index 13169c48..ca3b0e9f 100644 --- a/marble/environments/db_env.py +++ b/marble/environments/db_env.py @@ -2,15 +2,21 @@ import subprocess import time from typing import Any, Dict, List +import re +import psycopg2 +from psycopg2 import OperationalError import numpy as np import requests 
+from marble.environments.db_utils.anomaly_detection import detect_anomalies, describe_data_features from marble.environments.base_env import BaseEnvironment -from marble.environments.db_utils.anomaly_detection import detect_anomalies +from marble.environments.db_utils.metrics import allowed_metrics_full_names, full_metrics_full_names +from marble.environments.db_utils.diagnostic_kb import DiagnosticKB +from marble.environments.db_utils.slow_query import obtain_slow_queries -def get_prometheus_metric_data(metric_name: str) -> List[Any]: +def get_prometheus_metric_data(metric_name: str) -> List[List[Any]]: """ Query Prometheus for the given metric data from the last hour, sampling every 60 seconds. @@ -18,37 +24,37 @@ metric_name (str): The name of the metric to retrieve (e.g., 'node:cpu:usage_avg1m'). Returns: - List[List[Any]]: A list of timestamp-value pairs for the metric over the past hour. + List[List[Any]]: A list of timestamp-value pairs for the metric over the past 10 minutes. """ # Get the current time in Unix timestamp end_time = time.time() - # Calculate the start time (one hour ago) - start_time = end_time - 3600 # 3600 seconds = 1 hour + # Calculate the start time (10 minutes ago) + start_time = end_time - 600 # 600 seconds = 10 minutes # Prometheus query range URL prom_url = 'http://localhost:9090/api/v1/query_range' # Parameters for the query - # params = { - # 'query': metric_name, - # 'start': start_time, - # 'end': end_time, - # 'step': 60 # sample every 60 seconds - # } - - prom_url_with_params = f"{prom_url}?query={metric_name}&start={start_time}&end={end_time}&step=60" + params = { + 'query': metric_name, + 'start': start_time, + 'end': end_time, + 'step': 1, # sample every second + } # Make the HTTP request to Prometheus - response = requests.get(prom_url_with_params) + response = requests.get(prom_url, params=params) # Check if the request was successful if response.status_code == 200: data = response.json() if data.get('status') == 'success': # Extract the values (timestamp-value pairs) from the response - assert isinstance(data['data']['result'][0]['values'], list) - return data['data']['result'][0]['values'] + try: + return data['data']['result'][0]['values'] + except (KeyError, IndexError): + # No result series yet for this metric; return an empty list instead of failing + return [] else: raise ValueError(f"Prometheus returned an error: {data.get('error', 'Unknown error')}") else: @@ -64,49 +70,80 @@ def __init__(self, config: Dict[str, Any], name: str = "DBEnv"): """ super().__init__(name, config) - os.chdir('./db_env_docker') + self.kb = DiagnosticKB() + + self.current_dir = os.path.dirname(os.path.abspath(__file__)) print("Starting Docker containers...") # Run docker-compose up in detached mode - subprocess.run(["docker", "compose", "down", "-v"], shell=False, check=True) + subprocess.run(["docker", "compose", "down", "-v"], cwd=os.path.join(self.current_dir, "db_env_docker"), shell=False, check=True) # Then, run "docker-compose up - subprocess.run(["docker", "compose", "up", "-d", "--remove-orphans"], check=True) + subprocess.run(["docker", "compose", "up", "-d", "--remove-orphans"], cwd=os.path.join(self.current_dir, "db_env_docker"), check=True) # anomalies - env_configs = config.get('environment', []) - if env_configs: - anomalies = config.get('anomalies', []) + anomalies = config.get('anomalies', []) + + is_db_up = False + while not is_db_up: + try: + is_db_up = self.check_db_connection() + if not is_db_up: + time.sleep(1) + except Exception: + pass + print('DB up and running') + + + if anomalies: + for anomaly in anomalies: anomaly_type = 
anomaly['anomaly'] threads = anomaly['threads'] ncolumn = anomaly['ncolumn'] colsize = anomaly['colsize'] - subprocess.run(["python3", "anomaly_trigger/main.py", "--anomaly", anomaly_type, "--threads", f"{threads}", "--ncolumn", f"{ncolumn}", "--colsize", f"{colsize}"], check=True) + + subprocess.run(["python", "main.py", "--anomaly", anomaly_type, "--threads", f"{threads}", "--ncolumn", f"{ncolumn}", "--colsize", f"{colsize}"], cwd=os.path.join(self.current_dir, "db_env_docker", "anomaly_trigger"), check=True) # Register the actions available in this environment self.register_action( - "whether_is_abnormal_metric", - handler=self.whether_is_abnormal_metric_handler, + "get_alerts", + handler=self.get_alerts_handler, + description={ + "type": "function", + "function": { + "name": "get_alerts", + "description": "Get current alerts from the database monitoring system. Returns information about any active alerts including their names, descriptions, and severity levels.", + "parameters": { + "type": "object", + "properties": {}, + "required": [], + "additionalProperties": False + } + } + } + ) + + self.register_action( + "detect_metric_abnormality", + handler=self.detect_metric_abnormality_handler, description={ "type": "function", "function": { - "name": "whether_is_abnormal_metric", - "description": "Check if an metric of the database system is abnormal or not.", + "name": "detect_metric_abnormality", + "description": "Check if a type of metric of the database system is abnormal or not using a statistical method. This is used for an initial check of where things have gone wrong.", "parameters": { "type": "object", "properties": { "metric_name": { "type": "string", - "description": "The name of the metric to check for anormalies. It will examine the data from the last hour, sampling every 60 seconds. Anomalies are checked using the KS test algorithm.", + "description": "The name of the metric to check for anomalies. It will examine the data from the last 10 minutes, sampling every second. Anomalies are checked using the KS test algorithm.", "enum": [ "cpu_usage", - "disk_io", - "disk_read", - "disk_write", - "mem_usage", - "space_usage" + "memory_usage", + "network_traffic", + "io_activity" ] } }, @@ -117,59 +154,217 @@ def __init__(self, config: Dict[str, Any], name: str = "DBEnv"): } ) - # TODO: match_diagnose_knowledge, optimize_index_selection + self.register_action( + "match_diagnose_knowledge", + handler=self.match_diagnose_knowledge_handler, + description={ + "type": "function", + "function": { + "name": "match_diagnose_knowledge", + "description": "Match diagnostic knowledge for the chosen expert against the given metric type and any related abnormal metrics, to help identify what has caused the problem.", + "parameters": { + "type": "object", + "properties": { + "expert": { + "type": "string", + "description": "The type of expert to consult", + "enum": [ + "ConfigurationExpert", + "CpuExpert", + "DiskExpert", + "IndexExpert", + "IoExpert", + "MemoryExpert", + "QueryExpert", + "RecoveryExpert", + "WorkloadExpert" + ] + }, + "metric_name": { + "type": "string", + "description": "The type of metric to check for anomalies. It will examine the data from the last 10 minutes, sampling every second. 
Anomalies are checked using the KS test algorithm.", + "enum": [ + "cpu", + "memory", + "network", + "io" + ] + } + }, + "required": ["expert", "metric_name"], + "additionalProperties": False + } + } + } + ) + + is_initialized = False + alerts = [] + while True: + try: + alerts = self.get_raw_alerts()['alerts'] + time.sleep(1) + if len(alerts): + is_initialized = True + break + except: + pass + print(f'Alert detected @ {alerts}') - def whether_is_abnormal_metric_handler(self, metric_name: str) -> Dict[str, Any]: - #try: - if True: - # Get the metric data from Prometheus - metric_name_mapper = { - "cpu_usage": "node:cpu:usage_avg1m", - "disk_io": "node:cls:disk_io_bytes_rate1m", - "disk_read": "node:cls:disk_read_bytes_rate1m", - "disk_write":"node:cls:disk_write_bytes_rate1m", - "mem_usage": "node:cls:mem_usage", - "space_usage": "node:cls:space_usage", + def get_alerts_handler(self) -> Dict[str, Any]: + """ + Handler function to get current alerts from Prometheus. + + Returns: + Dict[str, Any]: Dictionary containing alert information in a structured format + """ + try: + alerts = self.get_raw_alerts() + formatted_alerts = [] + + for alert in alerts.get('alerts', []): + formatted_alert = { + 'name': alert['labels'].get('alertname', 'Unknown'), + 'severity': alert['labels'].get('severity', 'Unknown'), + 'description': alert['annotations'].get('description', ''), + 'state': alert.get('state', ''), + 'active_since': alert.get('activeAt', ''), + 'value': alert.get('value', '') + } + formatted_alerts.append(formatted_alert) + + return { + 'status': 'success', + 'alert_count': len(formatted_alerts), + 'alerts': formatted_alerts + } + except Exception as e: + return { + 'status': 'error', + 'message': str(e), + 'alerts': [] } - metric_name_mapped = metric_name_mapper.get(metric_name, "") + + def detect_metric_abnormality_handler(self, metric_name: str) -> bool: + try: + # Get the metric data from Prometheus + metric_name_mapped = allowed_metrics_full_names.get(metric_name, "") if metric_name_mapped == "": raise ValueError(f"Access to {metric_name} currently not supported") print(metric_name_mapped) values = get_prometheus_metric_data(metric_name_mapped) if not len(values): print('No values yet. Please wait at least 15s.') - return {"success": False, "message": "Execution failed. No values yet. 
Please wait at least 15s."} + return False values_list = [float(v) for t, v in values] # Convert the list into a 1D NumPy array - values_array = np.array(values_list, dtype=np.float64) - ks_statistic, anomalies = detect_anomalies(values_array) - if np.any(anomalies): - print(f"Anomalies detected in the metric '{metric_name}'") - return {"success": True, "message": f"Anomalies detected in the metric '{metric_name}'; ks_statistic: {ks_statistic}, anomalies: {anomalies}"} - else: - print(f"No anomalies detected in the metric '{metric_name}'") - return {"success": True, "message": f"No anomalies detected in the metric '{metric_name}'"} + values_array = np.array(values_list) + return detect_anomalies(values_array) + except Exception as e: + print(f"Error fetching metric data: {e}") + return False - def get_alerts(self) -> Dict[str, Any]: - prom_url = 'http://localhost:9090/api/v1/alerts' + def match_diagnose_knowledge_handler(self, expert: str, metric_name: str) -> str: + # first, we get the alert metrics + alerts = self.get_raw_alerts() + alert_metrics = [] + alert_descriptions = [] + alert_metric_str = "" + for alert in alerts['alerts']: + alert_description = alert['annotations']['description'] + alert_metric = alert_description.split('[')[0] + alert_metrics.append(alert_metric.strip()) + alert_descriptions.append(alert_description) + + alert_metric_str += f"{alert_metric.strip()} triggered alert: {alert_description}. \n" + + anomaly_data = get_prometheus_metric_data(alert_metric) + anomaly_data_list = [float(v) for t, v in anomaly_data] + anomaly_data_array = np.array(anomaly_data_list) + anomaly_data_features = describe_data_features(anomaly_data_list) + + alert_metric_str += f"Data description for {alert_metric}: {anomaly_data_features} \n" + alert_metric_str += f"\n" + + llm_selected_metric_str = "" + for name in full_metrics_full_names[metric_name]: + query = full_metrics_full_names[metric_name][name] + data = get_prometheus_metric_data(query) + data_list = [float(v) for t, v in data] + data_array = np.array(data_list) + anomaly = detect_anomalies(data_array) + if anomaly[1]: + data_features = describe_data_features(data_list) + llm_selected_metric_str += f"{name} (Query: {query}) is abnormal.\n" + llm_selected_metric_str += f"Data description: {data_features}\n" + llm_selected_metric_str += f"\n" + + rag_str = f"" + self.kb.search(metric_name, expert=expert) + rag_str += f"For expert {expert}, the following knowledge is matched: \n" + + for alert_description in alert_descriptions: + rag_str += f"For the alert description you wanted to look into, here are the matched knowledge: \n" + for result in self.kb.search(alert_description, expert=expert, top_k=3): + rag_str += f"{result}:\n" + rag_str += f"Cause : {result['cause_name']}\n" + rag_str += f"Metrics: {result['metrics']}\n" + rag_str += f"Expert : {result['expert']}\n" + rag_str += f"\n" + + slow_query_str = f"Here are the commands that took longest time:\n" + slow_query_str += obtain_slow_queries() - # Make the HTTP request to Prometheus + rag_str += f"For the metric you wanted to look into, here are the matched knowledge: \n" + for result in self.kb.search(llm_selected_metric_str, expert=expert, top_k=3): + rag_str += f"{result}:\n" + rag_str += f"Cause : {result['cause_name']}\n" + rag_str += f"Metrics: {result['metrics']}\n" + rag_str += f"Expert : {result['expert']}\n" + rag_str += f"\n" + + return alert_metric_str + llm_selected_metric_str + slow_query_str + rag_str + + def get_raw_alerts(self) -> dict: + """ + Get raw alerts 
data from Prometheus. + + Returns: + dict: Raw alerts data from Prometheus + """ + prom_url = 'http://localhost:9090/api/v1/alerts' response = requests.get(prom_url) - # Check if the request was successful if response.status_code == 200: data = response.json() if data.get('status') == 'success': - # Extract the values (timestamp-value pairs) from the response - assert isinstance(data['data'], dict) return data['data'] else: raise ValueError(f"Prometheus returned an error: {data.get('error', 'Unknown error')}") else: raise ValueError(f"Failed to query Prometheus. Status code: {response.status_code}") + def check_db_connection(self): + """Check if the database is up and return True if successful, False otherwise.""" + try: + # Attempt to connect to PostgreSQL database + connection = psycopg2.connect( + user="test", + password="Test123_456", + database="sysbench", + host="localhost", # Use "postgres_db" if running within Docker + port="5432" + ) + print("Database is up!") + connection.close() + return True # Return True if connection is successful + + except OperationalError: + print("Database is not available.") + return False # Return False if connection fails + def terminate(self) -> None: - subprocess.run(["docker", "compose", "down"], check=True) + subprocess.run(["docker", "compose", "down"], cwd=os.path.join(self.current_dir, "db_env_docker"), check=True) if __name__ == "__main__": env = DBEnvironment(config={ @@ -188,9 +383,13 @@ def terminate(self) -> None: while True: command = input('> ') if command == 'alert': - print(env.get_alerts()) + print(env.get_alerts_handler()) elif command == 'cpu': - print(env.whether_is_abnormal_metric_handler('cpu_usage')) + print(env.detect_metric_abnormality_handler('cpu_usage')) + elif command == 'analyze': + print(env.match_diagnose_knowledge_handler('WorkloadExpert', 'cpu')) + elif command == 'slow': + print(obtain_slow_queries()) elif command == 'q': env.terminate() - break + break \ No newline at end of file diff --git a/marble/environments/db_env_docker/README.md b/marble/environments/db_env_docker/README.md index 7da43a6c..dc2a983a 100755 --- a/marble/environments/db_env_docker/README.md +++ b/marble/environments/db_env_docker/README.md @@ -1,3 +1,12 @@ +# Labels for Test Set +```python +['too many indexes', 'missing indexes', 'POOR JOIN PERFORMANCE', 'highly deletes', 'highly concurrent commits or highly concurrent inserts', 'CORRELATED SUBQUERY', 'INSERT_LARGE_DATA', 'FETCH_LARGE_DATA', 'IO_CONTENTION', 'CPU CONTENTION', 'highly concurrent updates'] +``` +There could be 1-2 root causes: +```python +[1, 1, 1, 1, 1, 2, 2, 2, 1, 1, 1, 1, 1, 2, 2, 2, 1, 1, 1, 1, 1, 2, 2, 2, 1, 1, 1, 1, 1, 2, 2, 2, 1, 1, 1, 1, 1, 2, 2, 2, 1, 1, 1, 1, 1, 2, 2, 2, 1, 1, 1, 1, 1, 2, 2, 2, 1, 1, 1, 1, 1, 2, 2, 2, 1, 1, 1, 1, 1, 2, 2, 2, 1, 1, 1, 1, 1, 2, 2, 2, 1, 1, 1, 1, 1, 2, 2, 2, 1, 1, 1, 1, 1, 2, 2, 2, 1, 1, 1, 1, 1, 2, 2, 2, 1, 1, 1, 1, 1, 2, 2, 2, 1, 1, 1, 1, 1, 2, 2, 2, 1, 1, 1, 1, 1, 2, 2, 2, 1, 1, 1, 1, 1, 2, 2, 2, 1, 1, 1, 1, 1, 2, 2, 2, 1, 1, 1, 1, 1, 2, 2, 2, 1, 1, 1, 1, 1, 2, 2, 2, 1, 1, 1, 1, 1, 2, 2, 2, 1, 1, 1, 1, 1, 2, 2, 2, 1, 1, 1, 1, 1, 2, 2, 2, 1, 1, 1, 1, 1, 2, 2, 2, 1, 1, 1, 1, 1, 2, 2, 2, 1, 1, 1, 1, 1, 2, 2, 2, 1, 1, 1, 1, 1, 2, 2, 2, 1, 1, 1, 1, 1, 2, 2, 2, 1, 1, 1, 1, 1, 2, 2, 2, 1, 1, 1, 1, 1, 2, 2, 2, 1, 1, 1, 1, 1, 2, 2, 2, 1, 1, 1, 1, 1, 2, 2, 2, 1, 1, 1, 1, 1, 2, 2, 2, 1, 1, 1, 1, 1, 2, 2, 2, 1, 1, 1, 1, 1, 2, 2, 2, 1, 1, 1, 1, 1, 2, 2, 2, 1, 1, 1, 1, 1, 2, 2, 2, 1, 1, 1, 1, 1, 2, 2, 2, 1, 1, 1, 1, 1, 2, 2, 2, 1, 1, 1, 1, 1, 2, 2, 2, 
1, 1, 1, 1, 1, 2, 2, 2, 1, 1, 1, 1, 1, 2, 2, 2, 1, 1, 1, 1, 1, 2, 2, 2, 1, 1, 1, 1, 1, 2, 2, 2, 1, 1, 1, 1, 1, 2, 2, 2, 1, 1, 1, 1, 1, 2, 2, 2, 1, 1, 1, 1, 1, 2, 2, 2, 1, 1, 1, 1, 1, 2, 2, 2, 1, 1, 1, 1, 1, 2, 2, 2, 1, 1, 1, 1, 1, 2, 2, 2, 1, 1, 1, 1, 1, 2, 2, 2, 1, 1, 1, 1, 1, 2, 2, 2, 1, 1, 1, 1, 1, 2, 2, 2, 1, 1, 1, 1, 1, 2, 2, 2, 1, 1, 1, 1, 1, 2, 2, 2, 1, 1, 1, 1, 1, 2, 2, 2, 1, 1, 1, 1, 1, 2, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1] +``` + # Prometheus And AlertManager Deployment Tutorial

diff --git a/marble/environments/db_env_docker/join-order-benchmark-master/10a.sql b/marble/environments/db_env_docker/join-order-benchmark-master/10a.sql new file mode 100644 index 00000000..13dcad63 --- /dev/null +++ b/marble/environments/db_env_docker/join-order-benchmark-master/10a.sql @@ -0,0 +1,22 @@ +SELECT MIN(chn.name) AS uncredited_voiced_character, + MIN(t.title) AS russian_movie +FROM char_name AS chn, + cast_info AS ci, + company_name AS cn, + company_type AS ct, + movie_companies AS mc, + role_type AS rt, + title AS t +WHERE ci.note LIKE '%(voice)%' + AND ci.note LIKE '%(uncredited)%' + AND cn.country_code = '[ru]' + AND rt.role = 'actor' + AND t.production_year > 2005 + AND t.id = mc.movie_id + AND t.id = ci.movie_id + AND ci.movie_id = mc.movie_id + AND chn.id = ci.person_role_id + AND rt.id = ci.role_id + AND cn.id = mc.company_id + AND ct.id = mc.company_type_id; + diff --git a/marble/environments/db_env_docker/join-order-benchmark-master/10b.sql b/marble/environments/db_env_docker/join-order-benchmark-master/10b.sql new file mode 100644 index 00000000..916a78af --- /dev/null +++ b/marble/environments/db_env_docker/join-order-benchmark-master/10b.sql @@ -0,0 +1,21 @@ +SELECT MIN(chn.name) AS character, + MIN(t.title) AS russian_mov_with_actor_producer +FROM char_name AS chn, + cast_info AS ci, + company_name AS cn, + company_type AS ct, + movie_companies AS mc, + role_type AS rt, + title AS t +WHERE ci.note LIKE '%(producer)%' + AND cn.country_code = '[ru]' + AND rt.role = 'actor' + AND t.production_year > 2010 + AND t.id = mc.movie_id + AND t.id = ci.movie_id + AND ci.movie_id = mc.movie_id + AND chn.id = ci.person_role_id + AND rt.id = ci.role_id + AND cn.id = mc.company_id + AND ct.id = mc.company_type_id; + diff --git a/marble/environments/db_env_docker/join-order-benchmark-master/10c.sql b/marble/environments/db_env_docker/join-order-benchmark-master/10c.sql new file mode 100644 index 00000000..234c4e05 --- /dev/null +++ b/marble/environments/db_env_docker/join-order-benchmark-master/10c.sql @@ -0,0 +1,19 @@ +SELECT MIN(chn.name) AS character, + MIN(t.title) AS movie_with_american_producer +FROM char_name AS chn, + cast_info AS ci, + company_name AS cn, + company_type AS ct, + movie_companies AS mc, + role_type AS rt, + title AS t +WHERE ci.note LIKE '%(producer)%' + AND cn.country_code = '[us]' + AND t.production_year > 1990 + AND t.id = mc.movie_id + AND t.id = ci.movie_id + AND ci.movie_id = mc.movie_id + AND chn.id = ci.person_role_id + AND rt.id = ci.role_id + AND cn.id = mc.company_id + AND ct.id = mc.company_type_id; diff --git a/marble/environments/db_env_docker/join-order-benchmark-master/11a.sql b/marble/environments/db_env_docker/join-order-benchmark-master/11a.sql new file mode 100644 index 00000000..24bc6b62 --- /dev/null +++ b/marble/environments/db_env_docker/join-order-benchmark-master/11a.sql @@ -0,0 +1,30 @@ +SELECT MIN(cn.name) AS from_company, + MIN(lt.link) AS movie_link_type, + MIN(t.title) AS non_polish_sequel_movie +FROM company_name AS cn, + company_type AS ct, + keyword AS k, + link_type AS lt, + movie_companies AS mc, + movie_keyword AS mk, + movie_link AS ml, + title AS t +WHERE cn.country_code !='[pl]' + AND (cn.name LIKE '%Film%' + OR cn.name LIKE '%Warner%') + AND ct.kind ='production companies' + AND k.keyword ='sequel' + AND lt.link LIKE '%follow%' + AND mc.note IS NULL + AND t.production_year BETWEEN 1950 AND 2000 + AND lt.id = ml.link_type_id + AND ml.movie_id = t.id + AND t.id = mk.movie_id + AND mk.keyword_id = k.id + AND t.id = 
mc.movie_id + AND mc.company_type_id = ct.id + AND mc.company_id = cn.id + AND ml.movie_id = mk.movie_id + AND ml.movie_id = mc.movie_id + AND mk.movie_id = mc.movie_id; + diff --git a/marble/environments/db_env_docker/join-order-benchmark-master/11b.sql b/marble/environments/db_env_docker/join-order-benchmark-master/11b.sql new file mode 100644 index 00000000..881ad7cb --- /dev/null +++ b/marble/environments/db_env_docker/join-order-benchmark-master/11b.sql @@ -0,0 +1,31 @@ +SELECT MIN(cn.name) AS from_company, + MIN(lt.link) AS movie_link_type, + MIN(t.title) AS sequel_movie +FROM company_name AS cn, + company_type AS ct, + keyword AS k, + link_type AS lt, + movie_companies AS mc, + movie_keyword AS mk, + movie_link AS ml, + title AS t +WHERE cn.country_code !='[pl]' + AND (cn.name LIKE '%Film%' + OR cn.name LIKE '%Warner%') + AND ct.kind ='production companies' + AND k.keyword ='sequel' + AND lt.link LIKE '%follows%' + AND mc.note IS NULL + AND t.production_year = 1998 + AND t.title LIKE '%Money%' + AND lt.id = ml.link_type_id + AND ml.movie_id = t.id + AND t.id = mk.movie_id + AND mk.keyword_id = k.id + AND t.id = mc.movie_id + AND mc.company_type_id = ct.id + AND mc.company_id = cn.id + AND ml.movie_id = mk.movie_id + AND ml.movie_id = mc.movie_id + AND mk.movie_id = mc.movie_id; + diff --git a/marble/environments/db_env_docker/join-order-benchmark-master/11c.sql b/marble/environments/db_env_docker/join-order-benchmark-master/11c.sql new file mode 100644 index 00000000..03dd71b6 --- /dev/null +++ b/marble/environments/db_env_docker/join-order-benchmark-master/11c.sql @@ -0,0 +1,32 @@ +SELECT MIN(cn.name) AS from_company, + MIN(mc.note) AS production_note, + MIN(t.title) AS movie_based_on_book +FROM company_name AS cn, + company_type AS ct, + keyword AS k, + link_type AS lt, + movie_companies AS mc, + movie_keyword AS mk, + movie_link AS ml, + title AS t +WHERE cn.country_code !='[pl]' + AND (cn.name LIKE '20th Century Fox%' + OR cn.name LIKE 'Twentieth Century Fox%') + AND ct.kind != 'production companies' + AND ct.kind IS NOT NULL + AND k.keyword IN ('sequel', + 'revenge', + 'based-on-novel') + AND mc.note IS NOT NULL + AND t.production_year > 1950 + AND lt.id = ml.link_type_id + AND ml.movie_id = t.id + AND t.id = mk.movie_id + AND mk.keyword_id = k.id + AND t.id = mc.movie_id + AND mc.company_type_id = ct.id + AND mc.company_id = cn.id + AND ml.movie_id = mk.movie_id + AND ml.movie_id = mc.movie_id + AND mk.movie_id = mc.movie_id; + diff --git a/marble/environments/db_env_docker/join-order-benchmark-master/11d.sql b/marble/environments/db_env_docker/join-order-benchmark-master/11d.sql new file mode 100644 index 00000000..9872fda8 --- /dev/null +++ b/marble/environments/db_env_docker/join-order-benchmark-master/11d.sql @@ -0,0 +1,30 @@ +SELECT MIN(cn.name) AS from_company, + MIN(mc.note) AS production_note, + MIN(t.title) AS movie_based_on_book +FROM company_name AS cn, + company_type AS ct, + keyword AS k, + link_type AS lt, + movie_companies AS mc, + movie_keyword AS mk, + movie_link AS ml, + title AS t +WHERE cn.country_code !='[pl]' + AND ct.kind != 'production companies' + AND ct.kind IS NOT NULL + AND k.keyword IN ('sequel', + 'revenge', + 'based-on-novel') + AND mc.note IS NOT NULL + AND t.production_year > 1950 + AND lt.id = ml.link_type_id + AND ml.movie_id = t.id + AND t.id = mk.movie_id + AND mk.keyword_id = k.id + AND t.id = mc.movie_id + AND mc.company_type_id = ct.id + AND mc.company_id = cn.id + AND ml.movie_id = mk.movie_id + AND ml.movie_id = mc.movie_id + AND 
mk.movie_id = mc.movie_id; + diff --git a/marble/environments/db_env_docker/join-order-benchmark-master/12a.sql b/marble/environments/db_env_docker/join-order-benchmark-master/12a.sql new file mode 100644 index 00000000..e5b33664 --- /dev/null +++ b/marble/environments/db_env_docker/join-order-benchmark-master/12a.sql @@ -0,0 +1,30 @@ +SELECT MIN(cn.name) AS movie_company, + MIN(mi_idx.info) AS rating, + MIN(t.title) AS drama_horror_movie +FROM company_name AS cn, + company_type AS ct, + info_type AS it1, + info_type AS it2, + movie_companies AS mc, + movie_info AS mi, + movie_info_idx AS mi_idx, + title AS t +WHERE cn.country_code = '[us]' + AND ct.kind = 'production companies' + AND it1.info = 'genres' + AND it2.info = 'rating' + AND mi.info IN ('Drama', + 'Horror') + AND mi_idx.info > '8.0' + AND t.production_year BETWEEN 2005 AND 2008 + AND t.id = mi.movie_id + AND t.id = mi_idx.movie_id + AND mi.info_type_id = it1.id + AND mi_idx.info_type_id = it2.id + AND t.id = mc.movie_id + AND ct.id = mc.company_type_id + AND cn.id = mc.company_id + AND mc.movie_id = mi.movie_id + AND mc.movie_id = mi_idx.movie_id + AND mi.movie_id = mi_idx.movie_id; + diff --git a/marble/environments/db_env_docker/join-order-benchmark-master/12b.sql b/marble/environments/db_env_docker/join-order-benchmark-master/12b.sql new file mode 100644 index 00000000..16bc8989 --- /dev/null +++ b/marble/environments/db_env_docker/join-order-benchmark-master/12b.sql @@ -0,0 +1,30 @@ +SELECT MIN(mi.info) AS budget, + MIN(t.title) AS unsuccsessful_movie +FROM company_name AS cn, + company_type AS ct, + info_type AS it1, + info_type AS it2, + movie_companies AS mc, + movie_info AS mi, + movie_info_idx AS mi_idx, + title AS t +WHERE cn.country_code ='[us]' + AND ct.kind IS NOT NULL + AND (ct.kind ='production companies' + OR ct.kind = 'distributors') + AND it1.info ='budget' + AND it2.info ='bottom 10 rank' + AND t.production_year >2000 + AND (t.title LIKE 'Birdemic%' + OR t.title LIKE '%Movie%') + AND t.id = mi.movie_id + AND t.id = mi_idx.movie_id + AND mi.info_type_id = it1.id + AND mi_idx.info_type_id = it2.id + AND t.id = mc.movie_id + AND ct.id = mc.company_type_id + AND cn.id = mc.company_id + AND mc.movie_id = mi.movie_id + AND mc.movie_id = mi_idx.movie_id + AND mi.movie_id = mi_idx.movie_id; + diff --git a/marble/environments/db_env_docker/join-order-benchmark-master/12c.sql b/marble/environments/db_env_docker/join-order-benchmark-master/12c.sql new file mode 100644 index 00000000..97e5a116 --- /dev/null +++ b/marble/environments/db_env_docker/join-order-benchmark-master/12c.sql @@ -0,0 +1,32 @@ +SELECT MIN(cn.name) AS movie_company, + MIN(mi_idx.info) AS rating, + MIN(t.title) AS mainstream_movie +FROM company_name AS cn, + company_type AS ct, + info_type AS it1, + info_type AS it2, + movie_companies AS mc, + movie_info AS mi, + movie_info_idx AS mi_idx, + title AS t +WHERE cn.country_code = '[us]' + AND ct.kind = 'production companies' + AND it1.info = 'genres' + AND it2.info = 'rating' + AND mi.info IN ('Drama', + 'Horror', + 'Western', + 'Family') + AND mi_idx.info > '7.0' + AND t.production_year BETWEEN 2000 AND 2010 + AND t.id = mi.movie_id + AND t.id = mi_idx.movie_id + AND mi.info_type_id = it1.id + AND mi_idx.info_type_id = it2.id + AND t.id = mc.movie_id + AND ct.id = mc.company_type_id + AND cn.id = mc.company_id + AND mc.movie_id = mi.movie_id + AND mc.movie_id = mi_idx.movie_id + AND mi.movie_id = mi_idx.movie_id; + diff --git a/marble/environments/db_env_docker/join-order-benchmark-master/13a.sql 
b/marble/environments/db_env_docker/join-order-benchmark-master/13a.sql new file mode 100644 index 00000000..dfef6fe0 --- /dev/null +++ b/marble/environments/db_env_docker/join-order-benchmark-master/13a.sql @@ -0,0 +1,29 @@ +SELECT MIN(mi.info) AS release_date, + MIN(miidx.info) AS rating, + MIN(t.title) AS german_movie +FROM company_name AS cn, + company_type AS ct, + info_type AS it, + info_type AS it2, + kind_type AS kt, + movie_companies AS mc, + movie_info AS mi, + movie_info_idx AS miidx, + title AS t +WHERE cn.country_code ='[de]' + AND ct.kind ='production companies' + AND it.info ='rating' + AND it2.info ='release dates' + AND kt.kind ='movie' + AND mi.movie_id = t.id + AND it2.id = mi.info_type_id + AND kt.id = t.kind_id + AND mc.movie_id = t.id + AND cn.id = mc.company_id + AND ct.id = mc.company_type_id + AND miidx.movie_id = t.id + AND it.id = miidx.info_type_id + AND mi.movie_id = miidx.movie_id + AND mi.movie_id = mc.movie_id + AND miidx.movie_id = mc.movie_id; + diff --git a/marble/environments/db_env_docker/join-order-benchmark-master/13b.sql b/marble/environments/db_env_docker/join-order-benchmark-master/13b.sql new file mode 100644 index 00000000..ecc262fe --- /dev/null +++ b/marble/environments/db_env_docker/join-order-benchmark-master/13b.sql @@ -0,0 +1,32 @@ +SELECT MIN(cn.name) AS producing_company, + MIN(miidx.info) AS rating, + MIN(t.title) AS movie_about_winning +FROM company_name AS cn, + company_type AS ct, + info_type AS it, + info_type AS it2, + kind_type AS kt, + movie_companies AS mc, + movie_info AS mi, + movie_info_idx AS miidx, + title AS t +WHERE cn.country_code ='[us]' + AND ct.kind ='production companies' + AND it.info ='rating' + AND it2.info ='release dates' + AND kt.kind ='movie' + AND t.title != '' + AND (t.title LIKE '%Champion%' + OR t.title LIKE '%Loser%') + AND mi.movie_id = t.id + AND it2.id = mi.info_type_id + AND kt.id = t.kind_id + AND mc.movie_id = t.id + AND cn.id = mc.company_id + AND ct.id = mc.company_type_id + AND miidx.movie_id = t.id + AND it.id = miidx.info_type_id + AND mi.movie_id = miidx.movie_id + AND mi.movie_id = mc.movie_id + AND miidx.movie_id = mc.movie_id; + diff --git a/marble/environments/db_env_docker/join-order-benchmark-master/13c.sql b/marble/environments/db_env_docker/join-order-benchmark-master/13c.sql new file mode 100644 index 00000000..921df955 --- /dev/null +++ b/marble/environments/db_env_docker/join-order-benchmark-master/13c.sql @@ -0,0 +1,32 @@ +SELECT MIN(cn.name) AS producing_company, + MIN(miidx.info) AS rating, + MIN(t.title) AS movie_about_winning +FROM company_name AS cn, + company_type AS ct, + info_type AS it, + info_type AS it2, + kind_type AS kt, + movie_companies AS mc, + movie_info AS mi, + movie_info_idx AS miidx, + title AS t +WHERE cn.country_code ='[us]' + AND ct.kind ='production companies' + AND it.info ='rating' + AND it2.info ='release dates' + AND kt.kind ='movie' + AND t.title != '' + AND (t.title LIKE 'Champion%' + OR t.title LIKE 'Loser%') + AND mi.movie_id = t.id + AND it2.id = mi.info_type_id + AND kt.id = t.kind_id + AND mc.movie_id = t.id + AND cn.id = mc.company_id + AND ct.id = mc.company_type_id + AND miidx.movie_id = t.id + AND it.id = miidx.info_type_id + AND mi.movie_id = miidx.movie_id + AND mi.movie_id = mc.movie_id + AND miidx.movie_id = mc.movie_id; + diff --git a/marble/environments/db_env_docker/join-order-benchmark-master/13d.sql b/marble/environments/db_env_docker/join-order-benchmark-master/13d.sql new file mode 100644 index 00000000..734748bf --- /dev/null +++ 
b/marble/environments/db_env_docker/join-order-benchmark-master/13d.sql @@ -0,0 +1,29 @@ +SELECT MIN(cn.name) AS producing_company, + MIN(miidx.info) AS rating, + MIN(t.title) AS movie +FROM company_name AS cn, + company_type AS ct, + info_type AS it, + info_type AS it2, + kind_type AS kt, + movie_companies AS mc, + movie_info AS mi, + movie_info_idx AS miidx, + title AS t +WHERE cn.country_code ='[us]' + AND ct.kind ='production companies' + AND it.info ='rating' + AND it2.info ='release dates' + AND kt.kind ='movie' + AND mi.movie_id = t.id + AND it2.id = mi.info_type_id + AND kt.id = t.kind_id + AND mc.movie_id = t.id + AND cn.id = mc.company_id + AND ct.id = mc.company_type_id + AND miidx.movie_id = t.id + AND it.id = miidx.info_type_id + AND mi.movie_id = miidx.movie_id + AND mi.movie_id = mc.movie_id + AND miidx.movie_id = mc.movie_id; + diff --git a/marble/environments/db_env_docker/join-order-benchmark-master/14a.sql b/marble/environments/db_env_docker/join-order-benchmark-master/14a.sql new file mode 100644 index 00000000..aaa86ca4 --- /dev/null +++ b/marble/environments/db_env_docker/join-order-benchmark-master/14a.sql @@ -0,0 +1,40 @@ +SELECT MIN(mi_idx.info) AS rating, + MIN(t.title) AS northern_dark_movie +FROM info_type AS it1, + info_type AS it2, + keyword AS k, + kind_type AS kt, + movie_info AS mi, + movie_info_idx AS mi_idx, + movie_keyword AS mk, + title AS t +WHERE it1.info = 'countries' + AND it2.info = 'rating' + AND k.keyword IN ('murder', + 'murder-in-title', + 'blood', + 'violence') + AND kt.kind = 'movie' + AND mi.info IN ('Sweden', + 'Norway', + 'Germany', + 'Denmark', + 'Swedish', + 'Denish', + 'Norwegian', + 'German', + 'USA', + 'American') + AND mi_idx.info < '8.5' + AND t.production_year > 2010 + AND kt.id = t.kind_id + AND t.id = mi.movie_id + AND t.id = mk.movie_id + AND t.id = mi_idx.movie_id + AND mk.movie_id = mi.movie_id + AND mk.movie_id = mi_idx.movie_id + AND mi.movie_id = mi_idx.movie_id + AND k.id = mk.keyword_id + AND it1.id = mi.info_type_id + AND it2.id = mi_idx.info_type_id; + diff --git a/marble/environments/db_env_docker/join-order-benchmark-master/14b.sql b/marble/environments/db_env_docker/join-order-benchmark-master/14b.sql new file mode 100644 index 00000000..e7364f47 --- /dev/null +++ b/marble/environments/db_env_docker/join-order-benchmark-master/14b.sql @@ -0,0 +1,41 @@ +SELECT MIN(mi_idx.info) AS rating, + MIN(t.title) AS western_dark_production +FROM info_type AS it1, + info_type AS it2, + keyword AS k, + kind_type AS kt, + movie_info AS mi, + movie_info_idx AS mi_idx, + movie_keyword AS mk, + title AS t +WHERE it1.info = 'countries' + AND it2.info = 'rating' + AND k.keyword IN ('murder', + 'murder-in-title') + AND kt.kind = 'movie' + AND mi.info IN ('Sweden', + 'Norway', + 'Germany', + 'Denmark', + 'Swedish', + 'Denish', + 'Norwegian', + 'German', + 'USA', + 'American') + AND mi_idx.info > '6.0' + AND t.production_year > 2010 + AND (t.title LIKE '%murder%' + OR t.title LIKE '%Murder%' + OR t.title LIKE '%Mord%') + AND kt.id = t.kind_id + AND t.id = mi.movie_id + AND t.id = mk.movie_id + AND t.id = mi_idx.movie_id + AND mk.movie_id = mi.movie_id + AND mk.movie_id = mi_idx.movie_id + AND mi.movie_id = mi_idx.movie_id + AND k.id = mk.keyword_id + AND it1.id = mi.info_type_id + AND it2.id = mi_idx.info_type_id; + diff --git a/marble/environments/db_env_docker/join-order-benchmark-master/14c.sql b/marble/environments/db_env_docker/join-order-benchmark-master/14c.sql new file mode 100644 index 00000000..bb1cf314 --- /dev/null +++ 
b/marble/environments/db_env_docker/join-order-benchmark-master/14c.sql @@ -0,0 +1,42 @@ +SELECT MIN(mi_idx.info) AS rating, + MIN(t.title) AS north_european_dark_production +FROM info_type AS it1, + info_type AS it2, + keyword AS k, + kind_type AS kt, + movie_info AS mi, + movie_info_idx AS mi_idx, + movie_keyword AS mk, + title AS t +WHERE it1.info = 'countries' + AND it2.info = 'rating' + AND k.keyword IS NOT NULL + AND k.keyword IN ('murder', + 'murder-in-title', + 'blood', + 'violence') + AND kt.kind IN ('movie', + 'episode') + AND mi.info IN ('Sweden', + 'Norway', + 'Germany', + 'Denmark', + 'Swedish', + 'Danish', + 'Norwegian', + 'German', + 'USA', + 'American') + AND mi_idx.info < '8.5' + AND t.production_year > 2005 + AND kt.id = t.kind_id + AND t.id = mi.movie_id + AND t.id = mk.movie_id + AND t.id = mi_idx.movie_id + AND mk.movie_id = mi.movie_id + AND mk.movie_id = mi_idx.movie_id + AND mi.movie_id = mi_idx.movie_id + AND k.id = mk.keyword_id + AND it1.id = mi.info_type_id + AND it2.id = mi_idx.info_type_id; + diff --git a/marble/environments/db_env_docker/join-order-benchmark-master/15a.sql b/marble/environments/db_env_docker/join-order-benchmark-master/15a.sql new file mode 100644 index 00000000..4471b8c7 --- /dev/null +++ b/marble/environments/db_env_docker/join-order-benchmark-master/15a.sql @@ -0,0 +1,33 @@ +SELECT MIN(mi.info) AS release_date, + MIN(t.title) AS internet_movie +FROM aka_title AS at, + company_name AS cn, + company_type AS ct, + info_type AS it1, + keyword AS k, + movie_companies AS mc, + movie_info AS mi, + movie_keyword AS mk, + title AS t +WHERE cn.country_code = '[us]' + AND it1.info = 'release dates' + AND mc.note LIKE '%(200%)%' + AND mc.note LIKE '%(worldwide)%' + AND mi.note LIKE '%internet%' + AND mi.info LIKE 'USA:% 200%' + AND t.production_year > 2000 + AND t.id = at.movie_id + AND t.id = mi.movie_id + AND t.id = mk.movie_id + AND t.id = mc.movie_id + AND mk.movie_id = mi.movie_id + AND mk.movie_id = mc.movie_id + AND mk.movie_id = at.movie_id + AND mi.movie_id = mc.movie_id + AND mi.movie_id = at.movie_id + AND mc.movie_id = at.movie_id + AND k.id = mk.keyword_id + AND it1.id = mi.info_type_id + AND cn.id = mc.company_id + AND ct.id = mc.company_type_id; + diff --git a/marble/environments/db_env_docker/join-order-benchmark-master/15b.sql b/marble/environments/db_env_docker/join-order-benchmark-master/15b.sql new file mode 100644 index 00000000..425489b5 --- /dev/null +++ b/marble/environments/db_env_docker/join-order-benchmark-master/15b.sql @@ -0,0 +1,34 @@ +SELECT MIN(mi.info) AS release_date, + MIN(t.title) AS youtube_movie +FROM aka_title AS at, + company_name AS cn, + company_type AS ct, + info_type AS it1, + keyword AS k, + movie_companies AS mc, + movie_info AS mi, + movie_keyword AS mk, + title AS t +WHERE cn.country_code = '[us]' + AND cn.name = 'YouTube' + AND it1.info = 'release dates' + AND mc.note LIKE '%(200%)%' + AND mc.note LIKE '%(worldwide)%' + AND mi.note LIKE '%internet%' + AND mi.info LIKE 'USA:% 200%' + AND t.production_year BETWEEN 2005 AND 2010 + AND t.id = at.movie_id + AND t.id = mi.movie_id + AND t.id = mk.movie_id + AND t.id = mc.movie_id + AND mk.movie_id = mi.movie_id + AND mk.movie_id = mc.movie_id + AND mk.movie_id = at.movie_id + AND mi.movie_id = mc.movie_id + AND mi.movie_id = at.movie_id + AND mc.movie_id = at.movie_id + AND k.id = mk.keyword_id + AND it1.id = mi.info_type_id + AND cn.id = mc.company_id + AND ct.id = mc.company_type_id; + diff --git 
a/marble/environments/db_env_docker/join-order-benchmark-master/15c.sql b/marble/environments/db_env_docker/join-order-benchmark-master/15c.sql new file mode 100644 index 00000000..bdead025 --- /dev/null +++ b/marble/environments/db_env_docker/join-order-benchmark-master/15c.sql @@ -0,0 +1,33 @@ +SELECT MIN(mi.info) AS release_date, + MIN(t.title) AS modern_american_internet_movie +FROM aka_title AS at, + company_name AS cn, + company_type AS ct, + info_type AS it1, + keyword AS k, + movie_companies AS mc, + movie_info AS mi, + movie_keyword AS mk, + title AS t +WHERE cn.country_code = '[us]' + AND it1.info = 'release dates' + AND mi.note LIKE '%internet%' + AND mi.info IS NOT NULL + AND (mi.info LIKE 'USA:% 199%' + OR mi.info LIKE 'USA:% 200%') + AND t.production_year > 1990 + AND t.id = at.movie_id + AND t.id = mi.movie_id + AND t.id = mk.movie_id + AND t.id = mc.movie_id + AND mk.movie_id = mi.movie_id + AND mk.movie_id = mc.movie_id + AND mk.movie_id = at.movie_id + AND mi.movie_id = mc.movie_id + AND mi.movie_id = at.movie_id + AND mc.movie_id = at.movie_id + AND k.id = mk.keyword_id + AND it1.id = mi.info_type_id + AND cn.id = mc.company_id + AND ct.id = mc.company_type_id; + diff --git a/marble/environments/db_env_docker/join-order-benchmark-master/15d.sql b/marble/environments/db_env_docker/join-order-benchmark-master/15d.sql new file mode 100644 index 00000000..fc62cb53 --- /dev/null +++ b/marble/environments/db_env_docker/join-order-benchmark-master/15d.sql @@ -0,0 +1,30 @@ +SELECT MIN(at.title) AS aka_title, + MIN(t.title) AS internet_movie_title +FROM aka_title AS at, + company_name AS cn, + company_type AS ct, + info_type AS it1, + keyword AS k, + movie_companies AS mc, + movie_info AS mi, + movie_keyword AS mk, + title AS t +WHERE cn.country_code = '[us]' + AND it1.info = 'release dates' + AND mi.note LIKE '%internet%' + AND t.production_year > 1990 + AND t.id = at.movie_id + AND t.id = mi.movie_id + AND t.id = mk.movie_id + AND t.id = mc.movie_id + AND mk.movie_id = mi.movie_id + AND mk.movie_id = mc.movie_id + AND mk.movie_id = at.movie_id + AND mi.movie_id = mc.movie_id + AND mi.movie_id = at.movie_id + AND mc.movie_id = at.movie_id + AND k.id = mk.keyword_id + AND it1.id = mi.info_type_id + AND cn.id = mc.company_id + AND ct.id = mc.company_type_id; + diff --git a/marble/environments/db_env_docker/join-order-benchmark-master/16a.sql b/marble/environments/db_env_docker/join-order-benchmark-master/16a.sql new file mode 100644 index 00000000..7ce0bb06 --- /dev/null +++ b/marble/environments/db_env_docker/join-order-benchmark-master/16a.sql @@ -0,0 +1,26 @@ +SELECT MIN(an.name) AS cool_actor_pseudonym, + MIN(t.title) AS series_named_after_char +FROM aka_name AS an, + cast_info AS ci, + company_name AS cn, + keyword AS k, + movie_companies AS mc, + movie_keyword AS mk, + name AS n, + title AS t +WHERE cn.country_code ='[us]' + AND k.keyword ='character-name-in-title' + AND t.episode_nr >= 50 + AND t.episode_nr < 100 + AND an.person_id = n.id + AND n.id = ci.person_id + AND ci.movie_id = t.id + AND t.id = mk.movie_id + AND mk.keyword_id = k.id + AND t.id = mc.movie_id + AND mc.company_id = cn.id + AND an.person_id = ci.person_id + AND ci.movie_id = mc.movie_id + AND ci.movie_id = mk.movie_id + AND mc.movie_id = mk.movie_id; + diff --git a/marble/environments/db_env_docker/join-order-benchmark-master/16b.sql b/marble/environments/db_env_docker/join-order-benchmark-master/16b.sql new file mode 100644 index 00000000..8aa7371c --- /dev/null +++ 
b/marble/environments/db_env_docker/join-order-benchmark-master/16b.sql @@ -0,0 +1,24 @@ +SELECT MIN(an.name) AS cool_actor_pseudonym, + MIN(t.title) AS series_named_after_char +FROM aka_name AS an, + cast_info AS ci, + company_name AS cn, + keyword AS k, + movie_companies AS mc, + movie_keyword AS mk, + name AS n, + title AS t +WHERE cn.country_code ='[us]' + AND k.keyword ='character-name-in-title' + AND an.person_id = n.id + AND n.id = ci.person_id + AND ci.movie_id = t.id + AND t.id = mk.movie_id + AND mk.keyword_id = k.id + AND t.id = mc.movie_id + AND mc.company_id = cn.id + AND an.person_id = ci.person_id + AND ci.movie_id = mc.movie_id + AND ci.movie_id = mk.movie_id + AND mc.movie_id = mk.movie_id; + diff --git a/marble/environments/db_env_docker/join-order-benchmark-master/16c.sql b/marble/environments/db_env_docker/join-order-benchmark-master/16c.sql new file mode 100644 index 00000000..91f4bbeb --- /dev/null +++ b/marble/environments/db_env_docker/join-order-benchmark-master/16c.sql @@ -0,0 +1,25 @@ +SELECT MIN(an.name) AS cool_actor_pseudonym, + MIN(t.title) AS series_named_after_char +FROM aka_name AS an, + cast_info AS ci, + company_name AS cn, + keyword AS k, + movie_companies AS mc, + movie_keyword AS mk, + name AS n, + title AS t +WHERE cn.country_code ='[us]' + AND k.keyword ='character-name-in-title' + AND t.episode_nr < 100 + AND an.person_id = n.id + AND n.id = ci.person_id + AND ci.movie_id = t.id + AND t.id = mk.movie_id + AND mk.keyword_id = k.id + AND t.id = mc.movie_id + AND mc.company_id = cn.id + AND an.person_id = ci.person_id + AND ci.movie_id = mc.movie_id + AND ci.movie_id = mk.movie_id + AND mc.movie_id = mk.movie_id; + diff --git a/marble/environments/db_env_docker/join-order-benchmark-master/16d.sql b/marble/environments/db_env_docker/join-order-benchmark-master/16d.sql new file mode 100644 index 00000000..f7fce518 --- /dev/null +++ b/marble/environments/db_env_docker/join-order-benchmark-master/16d.sql @@ -0,0 +1,26 @@ +SELECT MIN(an.name) AS cool_actor_pseudonym, + MIN(t.title) AS series_named_after_char +FROM aka_name AS an, + cast_info AS ci, + company_name AS cn, + keyword AS k, + movie_companies AS mc, + movie_keyword AS mk, + name AS n, + title AS t +WHERE cn.country_code ='[us]' + AND k.keyword ='character-name-in-title' + AND t.episode_nr >= 5 + AND t.episode_nr < 100 + AND an.person_id = n.id + AND n.id = ci.person_id + AND ci.movie_id = t.id + AND t.id = mk.movie_id + AND mk.keyword_id = k.id + AND t.id = mc.movie_id + AND mc.company_id = cn.id + AND an.person_id = ci.person_id + AND ci.movie_id = mc.movie_id + AND ci.movie_id = mk.movie_id + AND mc.movie_id = mk.movie_id; + diff --git a/marble/environments/db_env_docker/join-order-benchmark-master/17a.sql b/marble/environments/db_env_docker/join-order-benchmark-master/17a.sql new file mode 100644 index 00000000..2db3358d --- /dev/null +++ b/marble/environments/db_env_docker/join-order-benchmark-master/17a.sql @@ -0,0 +1,22 @@ +SELECT MIN(n.name) AS member_in_charnamed_american_movie, + MIN(n.name) AS a1 +FROM cast_info AS ci, + company_name AS cn, + keyword AS k, + movie_companies AS mc, + movie_keyword AS mk, + name AS n, + title AS t +WHERE cn.country_code ='[us]' + AND k.keyword ='character-name-in-title' + AND n.name LIKE 'B%' + AND n.id = ci.person_id + AND ci.movie_id = t.id + AND t.id = mk.movie_id + AND mk.keyword_id = k.id + AND t.id = mc.movie_id + AND mc.company_id = cn.id + AND ci.movie_id = mc.movie_id + AND ci.movie_id = mk.movie_id + AND mc.movie_id = mk.movie_id; + diff --git 
a/marble/environments/db_env_docker/join-order-benchmark-master/17b.sql b/marble/environments/db_env_docker/join-order-benchmark-master/17b.sql new file mode 100644 index 00000000..7585e06d --- /dev/null +++ b/marble/environments/db_env_docker/join-order-benchmark-master/17b.sql @@ -0,0 +1,21 @@ +SELECT MIN(n.name) AS member_in_charnamed_movie, + MIN(n.name) AS a1 +FROM cast_info AS ci, + company_name AS cn, + keyword AS k, + movie_companies AS mc, + movie_keyword AS mk, + name AS n, + title AS t +WHERE k.keyword ='character-name-in-title' + AND n.name LIKE 'Z%' + AND n.id = ci.person_id + AND ci.movie_id = t.id + AND t.id = mk.movie_id + AND mk.keyword_id = k.id + AND t.id = mc.movie_id + AND mc.company_id = cn.id + AND ci.movie_id = mc.movie_id + AND ci.movie_id = mk.movie_id + AND mc.movie_id = mk.movie_id; + diff --git a/marble/environments/db_env_docker/join-order-benchmark-master/17c.sql b/marble/environments/db_env_docker/join-order-benchmark-master/17c.sql new file mode 100644 index 00000000..8d177cb6 --- /dev/null +++ b/marble/environments/db_env_docker/join-order-benchmark-master/17c.sql @@ -0,0 +1,21 @@ +SELECT MIN(n.name) AS member_in_charnamed_movie, + MIN(n.name) AS a1 +FROM cast_info AS ci, + company_name AS cn, + keyword AS k, + movie_companies AS mc, + movie_keyword AS mk, + name AS n, + title AS t +WHERE k.keyword ='character-name-in-title' + AND n.name LIKE 'X%' + AND n.id = ci.person_id + AND ci.movie_id = t.id + AND t.id = mk.movie_id + AND mk.keyword_id = k.id + AND t.id = mc.movie_id + AND mc.company_id = cn.id + AND ci.movie_id = mc.movie_id + AND ci.movie_id = mk.movie_id + AND mc.movie_id = mk.movie_id; + diff --git a/marble/environments/db_env_docker/join-order-benchmark-master/17d.sql b/marble/environments/db_env_docker/join-order-benchmark-master/17d.sql new file mode 100644 index 00000000..9be724d6 --- /dev/null +++ b/marble/environments/db_env_docker/join-order-benchmark-master/17d.sql @@ -0,0 +1,20 @@ +SELECT MIN(n.name) AS member_in_charnamed_movie +FROM cast_info AS ci, + company_name AS cn, + keyword AS k, + movie_companies AS mc, + movie_keyword AS mk, + name AS n, + title AS t +WHERE k.keyword ='character-name-in-title' + AND n.name LIKE '%Bert%' + AND n.id = ci.person_id + AND ci.movie_id = t.id + AND t.id = mk.movie_id + AND mk.keyword_id = k.id + AND t.id = mc.movie_id + AND mc.company_id = cn.id + AND ci.movie_id = mc.movie_id + AND ci.movie_id = mk.movie_id + AND mc.movie_id = mk.movie_id; + diff --git a/marble/environments/db_env_docker/join-order-benchmark-master/17e.sql b/marble/environments/db_env_docker/join-order-benchmark-master/17e.sql new file mode 100644 index 00000000..784902e7 --- /dev/null +++ b/marble/environments/db_env_docker/join-order-benchmark-master/17e.sql @@ -0,0 +1,20 @@ +SELECT MIN(n.name) AS member_in_charnamed_movie +FROM cast_info AS ci, + company_name AS cn, + keyword AS k, + movie_companies AS mc, + movie_keyword AS mk, + name AS n, + title AS t +WHERE cn.country_code ='[us]' + AND k.keyword ='character-name-in-title' + AND n.id = ci.person_id + AND ci.movie_id = t.id + AND t.id = mk.movie_id + AND mk.keyword_id = k.id + AND t.id = mc.movie_id + AND mc.company_id = cn.id + AND ci.movie_id = mc.movie_id + AND ci.movie_id = mk.movie_id + AND mc.movie_id = mk.movie_id; + diff --git a/marble/environments/db_env_docker/join-order-benchmark-master/17f.sql b/marble/environments/db_env_docker/join-order-benchmark-master/17f.sql new file mode 100644 index 00000000..35bb0c49 --- /dev/null +++ 
b/marble/environments/db_env_docker/join-order-benchmark-master/17f.sql @@ -0,0 +1,20 @@ +SELECT MIN(n.name) AS member_in_charnamed_movie +FROM cast_info AS ci, + company_name AS cn, + keyword AS k, + movie_companies AS mc, + movie_keyword AS mk, + name AS n, + title AS t +WHERE k.keyword ='character-name-in-title' + AND n.name LIKE '%B%' + AND n.id = ci.person_id + AND ci.movie_id = t.id + AND t.id = mk.movie_id + AND mk.keyword_id = k.id + AND t.id = mc.movie_id + AND mc.company_id = cn.id + AND ci.movie_id = mc.movie_id + AND ci.movie_id = mk.movie_id + AND mc.movie_id = mk.movie_id; + diff --git a/marble/environments/db_env_docker/join-order-benchmark-master/18a.sql b/marble/environments/db_env_docker/join-order-benchmark-master/18a.sql new file mode 100644 index 00000000..edd21f3f --- /dev/null +++ b/marble/environments/db_env_docker/join-order-benchmark-master/18a.sql @@ -0,0 +1,26 @@ +SELECT MIN(mi.info) AS movie_budget, + MIN(mi_idx.info) AS movie_votes, + MIN(t.title) AS movie_title +FROM cast_info AS ci, + info_type AS it1, + info_type AS it2, + movie_info AS mi, + movie_info_idx AS mi_idx, + name AS n, + title AS t +WHERE ci.note IN ('(producer)', + '(executive producer)') + AND it1.info = 'budget' + AND it2.info = 'votes' + AND n.gender = 'm' + AND n.name LIKE '%Tim%' + AND t.id = mi.movie_id + AND t.id = mi_idx.movie_id + AND t.id = ci.movie_id + AND ci.movie_id = mi.movie_id + AND ci.movie_id = mi_idx.movie_id + AND mi.movie_id = mi_idx.movie_id + AND n.id = ci.person_id + AND it1.id = mi.info_type_id + AND it2.id = mi_idx.info_type_id; + diff --git a/marble/environments/db_env_docker/join-order-benchmark-master/18b.sql b/marble/environments/db_env_docker/join-order-benchmark-master/18b.sql new file mode 100644 index 00000000..03e685d8 --- /dev/null +++ b/marble/environments/db_env_docker/join-order-benchmark-master/18b.sql @@ -0,0 +1,34 @@ +SELECT MIN(mi.info) AS movie_budget, + MIN(mi_idx.info) AS movie_votes, + MIN(t.title) AS movie_title +FROM cast_info AS ci, + info_type AS it1, + info_type AS it2, + movie_info AS mi, + movie_info_idx AS mi_idx, + name AS n, + title AS t +WHERE ci.note IN ('(writer)', + '(head writer)', + '(written by)', + '(story)', + '(story editor)') + AND it1.info = 'genres' + AND it2.info = 'rating' + AND mi.info IN ('Horror', + 'Thriller') + AND mi.note IS NULL + AND mi_idx.info > '8.0' + AND n.gender IS NOT NULL + AND n.gender = 'f' + AND t.production_year BETWEEN 2008 AND 2014 + AND t.id = mi.movie_id + AND t.id = mi_idx.movie_id + AND t.id = ci.movie_id + AND ci.movie_id = mi.movie_id + AND ci.movie_id = mi_idx.movie_id + AND mi.movie_id = mi_idx.movie_id + AND n.id = ci.person_id + AND it1.id = mi.info_type_id + AND it2.id = mi_idx.info_type_id; + diff --git a/marble/environments/db_env_docker/join-order-benchmark-master/18c.sql b/marble/environments/db_env_docker/join-order-benchmark-master/18c.sql new file mode 100644 index 00000000..9762e815 --- /dev/null +++ b/marble/environments/db_env_docker/join-order-benchmark-master/18c.sql @@ -0,0 +1,34 @@ +SELECT MIN(mi.info) AS movie_budget, + MIN(mi_idx.info) AS movie_votes, + MIN(t.title) AS movie_title +FROM cast_info AS ci, + info_type AS it1, + info_type AS it2, + movie_info AS mi, + movie_info_idx AS mi_idx, + name AS n, + title AS t +WHERE ci.note IN ('(writer)', + '(head writer)', + '(written by)', + '(story)', + '(story editor)') + AND it1.info = 'genres' + AND it2.info = 'votes' + AND mi.info IN ('Horror', + 'Action', + 'Sci-Fi', + 'Thriller', + 'Crime', + 'War') + AND n.gender = 'm' + AND 
t.id = mi.movie_id + AND t.id = mi_idx.movie_id + AND t.id = ci.movie_id + AND ci.movie_id = mi.movie_id + AND ci.movie_id = mi_idx.movie_id + AND mi.movie_id = mi_idx.movie_id + AND n.id = ci.person_id + AND it1.id = mi.info_type_id + AND it2.id = mi_idx.info_type_id; + diff --git a/marble/environments/db_env_docker/join-order-benchmark-master/19a.sql b/marble/environments/db_env_docker/join-order-benchmark-master/19a.sql new file mode 100644 index 00000000..68a0eb69 --- /dev/null +++ b/marble/environments/db_env_docker/join-order-benchmark-master/19a.sql @@ -0,0 +1,42 @@ +SELECT MIN(n.name) AS voicing_actress, + MIN(t.title) AS voiced_movie +FROM aka_name AS an, + char_name AS chn, + cast_info AS ci, + company_name AS cn, + info_type AS it, + movie_companies AS mc, + movie_info AS mi, + name AS n, + role_type AS rt, + title AS t +WHERE ci.note IN ('(voice)', + '(voice: Japanese version)', + '(voice) (uncredited)', + '(voice: English version)') + AND cn.country_code ='[us]' + AND it.info = 'release dates' + AND mc.note IS NOT NULL + AND (mc.note LIKE '%(USA)%' + OR mc.note LIKE '%(worldwide)%') + AND mi.info IS NOT NULL + AND (mi.info LIKE 'Japan:%200%' + OR mi.info LIKE 'USA:%200%') + AND n.gender ='f' + AND n.name LIKE '%Ang%' + AND rt.role ='actress' + AND t.production_year BETWEEN 2005 AND 2009 + AND t.id = mi.movie_id + AND t.id = mc.movie_id + AND t.id = ci.movie_id + AND mc.movie_id = ci.movie_id + AND mc.movie_id = mi.movie_id + AND mi.movie_id = ci.movie_id + AND cn.id = mc.company_id + AND it.id = mi.info_type_id + AND n.id = ci.person_id + AND rt.id = ci.role_id + AND n.id = an.person_id + AND ci.person_id = an.person_id + AND chn.id = ci.person_role_id; + diff --git a/marble/environments/db_env_docker/join-order-benchmark-master/19b.sql b/marble/environments/db_env_docker/join-order-benchmark-master/19b.sql new file mode 100644 index 00000000..c2c5ba73 --- /dev/null +++ b/marble/environments/db_env_docker/join-order-benchmark-master/19b.sql @@ -0,0 +1,40 @@ +SELECT MIN(n.name) AS voicing_actress, + MIN(t.title) AS kung_fu_panda +FROM aka_name AS an, + char_name AS chn, + cast_info AS ci, + company_name AS cn, + info_type AS it, + movie_companies AS mc, + movie_info AS mi, + name AS n, + role_type AS rt, + title AS t +WHERE ci.note = '(voice)' + AND cn.country_code ='[us]' + AND it.info = 'release dates' + AND mc.note LIKE '%(200%)%' + AND (mc.note LIKE '%(USA)%' + OR mc.note LIKE '%(worldwide)%') + AND mi.info IS NOT NULL + AND (mi.info LIKE 'Japan:%2007%' + OR mi.info LIKE 'USA:%2008%') + AND n.gender ='f' + AND n.name LIKE '%Angel%' + AND rt.role ='actress' + AND t.production_year BETWEEN 2007 AND 2008 + AND t.title LIKE '%Kung%Fu%Panda%' + AND t.id = mi.movie_id + AND t.id = mc.movie_id + AND t.id = ci.movie_id + AND mc.movie_id = ci.movie_id + AND mc.movie_id = mi.movie_id + AND mi.movie_id = ci.movie_id + AND cn.id = mc.company_id + AND it.id = mi.info_type_id + AND n.id = ci.person_id + AND rt.id = ci.role_id + AND n.id = an.person_id + AND ci.person_id = an.person_id + AND chn.id = ci.person_role_id; + diff --git a/marble/environments/db_env_docker/join-order-benchmark-master/19c.sql b/marble/environments/db_env_docker/join-order-benchmark-master/19c.sql new file mode 100644 index 00000000..c8139bc7 --- /dev/null +++ b/marble/environments/db_env_docker/join-order-benchmark-master/19c.sql @@ -0,0 +1,39 @@ +SELECT MIN(n.name) AS voicing_actress, + MIN(t.title) AS jap_engl_voiced_movie +FROM aka_name AS an, + char_name AS chn, + cast_info AS ci, + company_name AS cn, + 
info_type AS it, + movie_companies AS mc, + movie_info AS mi, + name AS n, + role_type AS rt, + title AS t +WHERE ci.note IN ('(voice)', + '(voice: Japanese version)', + '(voice) (uncredited)', + '(voice: English version)') + AND cn.country_code ='[us]' + AND it.info = 'release dates' + AND mi.info IS NOT NULL + AND (mi.info LIKE 'Japan:%200%' + OR mi.info LIKE 'USA:%200%') + AND n.gender ='f' + AND n.name LIKE '%An%' + AND rt.role ='actress' + AND t.production_year > 2000 + AND t.id = mi.movie_id + AND t.id = mc.movie_id + AND t.id = ci.movie_id + AND mc.movie_id = ci.movie_id + AND mc.movie_id = mi.movie_id + AND mi.movie_id = ci.movie_id + AND cn.id = mc.company_id + AND it.id = mi.info_type_id + AND n.id = ci.person_id + AND rt.id = ci.role_id + AND n.id = an.person_id + AND ci.person_id = an.person_id + AND chn.id = ci.person_role_id; + diff --git a/marble/environments/db_env_docker/join-order-benchmark-master/19d.sql b/marble/environments/db_env_docker/join-order-benchmark-master/19d.sql new file mode 100644 index 00000000..03c7a850 --- /dev/null +++ b/marble/environments/db_env_docker/join-order-benchmark-master/19d.sql @@ -0,0 +1,35 @@ +SELECT MIN(n.name) AS voicing_actress, + MIN(t.title) AS jap_engl_voiced_movie +FROM aka_name AS an, + char_name AS chn, + cast_info AS ci, + company_name AS cn, + info_type AS it, + movie_companies AS mc, + movie_info AS mi, + name AS n, + role_type AS rt, + title AS t +WHERE ci.note IN ('(voice)', + '(voice: Japanese version)', + '(voice) (uncredited)', + '(voice: English version)') + AND cn.country_code ='[us]' + AND it.info = 'release dates' + AND n.gender ='f' + AND rt.role ='actress' + AND t.production_year > 2000 + AND t.id = mi.movie_id + AND t.id = mc.movie_id + AND t.id = ci.movie_id + AND mc.movie_id = ci.movie_id + AND mc.movie_id = mi.movie_id + AND mi.movie_id = ci.movie_id + AND cn.id = mc.company_id + AND it.id = mi.info_type_id + AND n.id = ci.person_id + AND rt.id = ci.role_id + AND n.id = an.person_id + AND ci.person_id = an.person_id + AND chn.id = ci.person_role_id; + diff --git a/marble/environments/db_env_docker/join-order-benchmark-master/1a.sql b/marble/environments/db_env_docker/join-order-benchmark-master/1a.sql new file mode 100644 index 00000000..cb464bb5 --- /dev/null +++ b/marble/environments/db_env_docker/join-order-benchmark-master/1a.sql @@ -0,0 +1,19 @@ +SELECT MIN(mc.note) AS production_note, + MIN(t.title) AS movie_title, + MIN(t.production_year) AS movie_year +FROM company_type AS ct, + info_type AS it, + movie_companies AS mc, + movie_info_idx AS mi_idx, + title AS t +WHERE ct.kind = 'production companies' + AND it.info = 'top 250 rank' + AND mc.note NOT LIKE '%(as Metro-Goldwyn-Mayer Pictures)%' + AND (mc.note LIKE '%(co-production)%' + OR mc.note LIKE '%(presents)%') + AND ct.id = mc.company_type_id + AND t.id = mc.movie_id + AND t.id = mi_idx.movie_id + AND mc.movie_id = mi_idx.movie_id + AND it.id = mi_idx.info_type_id; + diff --git a/marble/environments/db_env_docker/join-order-benchmark-master/1b.sql b/marble/environments/db_env_docker/join-order-benchmark-master/1b.sql new file mode 100644 index 00000000..c471f862 --- /dev/null +++ b/marble/environments/db_env_docker/join-order-benchmark-master/1b.sql @@ -0,0 +1,18 @@ +SELECT MIN(mc.note) AS production_note, + MIN(t.title) AS movie_title, + MIN(t.production_year) AS movie_year +FROM company_type AS ct, + info_type AS it, + movie_companies AS mc, + movie_info_idx AS mi_idx, + title AS t +WHERE ct.kind = 'production companies' + AND it.info = 'bottom 10 
rank' + AND mc.note NOT LIKE '%(as Metro-Goldwyn-Mayer Pictures)%' + AND t.production_year BETWEEN 2005 AND 2010 + AND ct.id = mc.company_type_id + AND t.id = mc.movie_id + AND t.id = mi_idx.movie_id + AND mc.movie_id = mi_idx.movie_id + AND it.id = mi_idx.info_type_id; + diff --git a/marble/environments/db_env_docker/join-order-benchmark-master/1c.sql b/marble/environments/db_env_docker/join-order-benchmark-master/1c.sql new file mode 100644 index 00000000..81b3b849 --- /dev/null +++ b/marble/environments/db_env_docker/join-order-benchmark-master/1c.sql @@ -0,0 +1,18 @@ +SELECT MIN(mc.note) AS production_note, + MIN(t.title) AS movie_title, + MIN(t.production_year) AS movie_year +FROM company_type AS ct, + info_type AS it, + movie_companies AS mc, + movie_info_idx AS mi_idx, + title AS t +WHERE ct.kind = 'production companies' + AND it.info = 'top 250 rank' + AND mc.note NOT LIKE '%(as Metro-Goldwyn-Mayer Pictures)%' + AND (mc.note LIKE '%(co-production)%') + AND t.production_year >2010 + AND ct.id = mc.company_type_id + AND t.id = mc.movie_id + AND t.id = mi_idx.movie_id + AND mc.movie_id = mi_idx.movie_id + AND it.id = mi_idx.info_type_id; \ No newline at end of file diff --git a/marble/environments/db_env_docker/join-order-benchmark-master/1d.sql b/marble/environments/db_env_docker/join-order-benchmark-master/1d.sql new file mode 100644 index 00000000..96a6eda9 --- /dev/null +++ b/marble/environments/db_env_docker/join-order-benchmark-master/1d.sql @@ -0,0 +1,18 @@ +SELECT MIN(mc.note) AS production_note, + MIN(t.title) AS movie_title, + MIN(t.production_year) AS movie_year +FROM company_type AS ct, + info_type AS it, + movie_companies AS mc, + movie_info_idx AS mi_idx, + title AS t +WHERE ct.kind = 'production companies' + AND it.info = 'bottom 10 rank' + AND mc.note NOT LIKE '%(as Metro-Goldwyn-Mayer Pictures)%' + AND t.production_year >2000 + AND ct.id = mc.company_type_id + AND t.id = mc.movie_id + AND t.id = mi_idx.movie_id + AND mc.movie_id = mi_idx.movie_id + AND it.id = mi_idx.info_type_id; + diff --git a/marble/environments/db_env_docker/join-order-benchmark-master/20a.sql b/marble/environments/db_env_docker/join-order-benchmark-master/20a.sql new file mode 100644 index 00000000..3eaedce6 --- /dev/null +++ b/marble/environments/db_env_docker/join-order-benchmark-master/20a.sql @@ -0,0 +1,39 @@ +SELECT MIN(t.title) AS complete_downey_ironman_movie +FROM complete_cast AS cc, + comp_cast_type AS cct1, + comp_cast_type AS cct2, + char_name AS chn, + cast_info AS ci, + keyword AS k, + kind_type AS kt, + movie_keyword AS mk, + name AS n, + title AS t +WHERE cct1.kind = 'cast' + AND cct2.kind LIKE '%complete%' + AND chn.name NOT LIKE '%Sherlock%' + AND (chn.name LIKE '%Tony%Stark%' + OR chn.name LIKE '%Iron%Man%') + AND k.keyword IN ('superhero', + 'sequel', + 'second-part', + 'marvel-comics', + 'based-on-comic', + 'tv-special', + 'fight', + 'violence') + AND kt.kind = 'movie' + AND t.production_year > 1950 + AND kt.id = t.kind_id + AND t.id = mk.movie_id + AND t.id = ci.movie_id + AND t.id = cc.movie_id + AND mk.movie_id = ci.movie_id + AND mk.movie_id = cc.movie_id + AND ci.movie_id = cc.movie_id + AND chn.id = ci.person_role_id + AND n.id = ci.person_id + AND k.id = mk.keyword_id + AND cct1.id = cc.subject_id + AND cct2.id = cc.status_id; + diff --git a/marble/environments/db_env_docker/join-order-benchmark-master/20b.sql b/marble/environments/db_env_docker/join-order-benchmark-master/20b.sql new file mode 100644 index 00000000..68c038ca --- /dev/null +++ 
b/marble/environments/db_env_docker/join-order-benchmark-master/20b.sql @@ -0,0 +1,40 @@ +SELECT MIN(t.title) AS complete_downey_ironman_movie +FROM complete_cast AS cc, + comp_cast_type AS cct1, + comp_cast_type AS cct2, + char_name AS chn, + cast_info AS ci, + keyword AS k, + kind_type AS kt, + movie_keyword AS mk, + name AS n, + title AS t +WHERE cct1.kind = 'cast' + AND cct2.kind LIKE '%complete%' + AND chn.name NOT LIKE '%Sherlock%' + AND (chn.name LIKE '%Tony%Stark%' + OR chn.name LIKE '%Iron%Man%') + AND k.keyword IN ('superhero', + 'sequel', + 'second-part', + 'marvel-comics', + 'based-on-comic', + 'tv-special', + 'fight', + 'violence') + AND kt.kind = 'movie' + AND n.name LIKE '%Downey%Robert%' + AND t.production_year > 2000 + AND kt.id = t.kind_id + AND t.id = mk.movie_id + AND t.id = ci.movie_id + AND t.id = cc.movie_id + AND mk.movie_id = ci.movie_id + AND mk.movie_id = cc.movie_id + AND ci.movie_id = cc.movie_id + AND chn.id = ci.person_role_id + AND n.id = ci.person_id + AND k.id = mk.keyword_id + AND cct1.id = cc.subject_id + AND cct2.id = cc.status_id; + diff --git a/marble/environments/db_env_docker/join-order-benchmark-master/20c.sql b/marble/environments/db_env_docker/join-order-benchmark-master/20c.sql new file mode 100644 index 00000000..14bc3665 --- /dev/null +++ b/marble/environments/db_env_docker/join-order-benchmark-master/20c.sql @@ -0,0 +1,42 @@ +SELECT MIN(n.name) AS cast_member, + MIN(t.title) AS complete_dynamic_hero_movie +FROM complete_cast AS cc, + comp_cast_type AS cct1, + comp_cast_type AS cct2, + char_name AS chn, + cast_info AS ci, + keyword AS k, + kind_type AS kt, + movie_keyword AS mk, + name AS n, + title AS t +WHERE cct1.kind = 'cast' + AND cct2.kind LIKE '%complete%' + AND chn.name IS NOT NULL + AND (chn.name LIKE '%man%' + OR chn.name LIKE '%Man%') + AND k.keyword IN ('superhero', + 'marvel-comics', + 'based-on-comic', + 'tv-special', + 'fight', + 'violence', + 'magnet', + 'web', + 'claw', + 'laser') + AND kt.kind = 'movie' + AND t.production_year > 2000 + AND kt.id = t.kind_id + AND t.id = mk.movie_id + AND t.id = ci.movie_id + AND t.id = cc.movie_id + AND mk.movie_id = ci.movie_id + AND mk.movie_id = cc.movie_id + AND ci.movie_id = cc.movie_id + AND chn.id = ci.person_role_id + AND n.id = ci.person_id + AND k.id = mk.keyword_id + AND cct1.id = cc.subject_id + AND cct2.id = cc.status_id; + diff --git a/marble/environments/db_env_docker/join-order-benchmark-master/21a.sql b/marble/environments/db_env_docker/join-order-benchmark-master/21a.sql new file mode 100644 index 00000000..e0b5c137 --- /dev/null +++ b/marble/environments/db_env_docker/join-order-benchmark-master/21a.sql @@ -0,0 +1,43 @@ +SELECT MIN(cn.name) AS company_name, + MIN(lt.link) AS link_type, + MIN(t.title) AS western_follow_up +FROM company_name AS cn, + company_type AS ct, + keyword AS k, + link_type AS lt, + movie_companies AS mc, + movie_info AS mi, + movie_keyword AS mk, + movie_link AS ml, + title AS t +WHERE cn.country_code !='[pl]' + AND (cn.name LIKE '%Film%' + OR cn.name LIKE '%Warner%') + AND ct.kind ='production companies' + AND k.keyword ='sequel' + AND lt.link LIKE '%follow%' + AND mc.note IS NULL + AND mi.info IN ('Sweden', + 'Norway', + 'Germany', + 'Denmark', + 'Swedish', + 'Denish', + 'Norwegian', + 'German') + AND t.production_year BETWEEN 1950 AND 2000 + AND lt.id = ml.link_type_id + AND ml.movie_id = t.id + AND t.id = mk.movie_id + AND mk.keyword_id = k.id + AND t.id = mc.movie_id + AND mc.company_type_id = ct.id + AND mc.company_id = cn.id + AND mi.movie_id = 
t.id + AND ml.movie_id = mk.movie_id + AND ml.movie_id = mc.movie_id + AND mk.movie_id = mc.movie_id + AND ml.movie_id = mi.movie_id + AND mk.movie_id = mi.movie_id + AND mc.movie_id = mi.movie_id; + diff --git a/marble/environments/db_env_docker/join-order-benchmark-master/21b.sql b/marble/environments/db_env_docker/join-order-benchmark-master/21b.sql new file mode 100644 index 00000000..ffd2dcf8 --- /dev/null +++ b/marble/environments/db_env_docker/join-order-benchmark-master/21b.sql @@ -0,0 +1,37 @@ +SELECT MIN(cn.name) AS company_name, + MIN(lt.link) AS link_type, + MIN(t.title) AS german_follow_up +FROM company_name AS cn, + company_type AS ct, + keyword AS k, + link_type AS lt, + movie_companies AS mc, + movie_info AS mi, + movie_keyword AS mk, + movie_link AS ml, + title AS t +WHERE cn.country_code !='[pl]' + AND (cn.name LIKE '%Film%' + OR cn.name LIKE '%Warner%') + AND ct.kind ='production companies' + AND k.keyword ='sequel' + AND lt.link LIKE '%follow%' + AND mc.note IS NULL + AND mi.info IN ('Germany', + 'German') + AND t.production_year BETWEEN 2000 AND 2010 + AND lt.id = ml.link_type_id + AND ml.movie_id = t.id + AND t.id = mk.movie_id + AND mk.keyword_id = k.id + AND t.id = mc.movie_id + AND mc.company_type_id = ct.id + AND mc.company_id = cn.id + AND mi.movie_id = t.id + AND ml.movie_id = mk.movie_id + AND ml.movie_id = mc.movie_id + AND mk.movie_id = mc.movie_id + AND ml.movie_id = mi.movie_id + AND mk.movie_id = mi.movie_id + AND mc.movie_id = mi.movie_id; + diff --git a/marble/environments/db_env_docker/join-order-benchmark-master/21c.sql b/marble/environments/db_env_docker/join-order-benchmark-master/21c.sql new file mode 100644 index 00000000..b7cc4dca --- /dev/null +++ b/marble/environments/db_env_docker/join-order-benchmark-master/21c.sql @@ -0,0 +1,44 @@ +SELECT MIN(cn.name) AS company_name, + MIN(lt.link) AS link_type, + MIN(t.title) AS western_follow_up +FROM company_name AS cn, + company_type AS ct, + keyword AS k, + link_type AS lt, + movie_companies AS mc, + movie_info AS mi, + movie_keyword AS mk, + movie_link AS ml, + title AS t +WHERE cn.country_code !='[pl]' + AND (cn.name LIKE '%Film%' + OR cn.name LIKE '%Warner%') + AND ct.kind ='production companies' + AND k.keyword ='sequel' + AND lt.link LIKE '%follow%' + AND mc.note IS NULL + AND mi.info IN ('Sweden', + 'Norway', + 'Germany', + 'Denmark', + 'Swedish', + 'Denish', + 'Norwegian', + 'German', + 'English') + AND t.production_year BETWEEN 1950 AND 2010 + AND lt.id = ml.link_type_id + AND ml.movie_id = t.id + AND t.id = mk.movie_id + AND mk.keyword_id = k.id + AND t.id = mc.movie_id + AND mc.company_type_id = ct.id + AND mc.company_id = cn.id + AND mi.movie_id = t.id + AND ml.movie_id = mk.movie_id + AND ml.movie_id = mc.movie_id + AND mk.movie_id = mc.movie_id + AND ml.movie_id = mi.movie_id + AND mk.movie_id = mi.movie_id + AND mc.movie_id = mi.movie_id; + diff --git a/marble/environments/db_env_docker/join-order-benchmark-master/22a.sql b/marble/environments/db_env_docker/join-order-benchmark-master/22a.sql new file mode 100644 index 00000000..19d361d3 --- /dev/null +++ b/marble/environments/db_env_docker/join-order-benchmark-master/22a.sql @@ -0,0 +1,48 @@ +SELECT MIN(cn.name) AS movie_company, + MIN(mi_idx.info) AS rating, + MIN(t.title) AS western_violent_movie +FROM company_name AS cn, + company_type AS ct, + info_type AS it1, + info_type AS it2, + keyword AS k, + kind_type AS kt, + movie_companies AS mc, + movie_info AS mi, + movie_info_idx AS mi_idx, + movie_keyword AS mk, + title AS t +WHERE 
cn.country_code != '[us]' + AND it1.info = 'countries' + AND it2.info = 'rating' + AND k.keyword IN ('murder', + 'murder-in-title', + 'blood', + 'violence') + AND kt.kind IN ('movie', + 'episode') + AND mc.note NOT LIKE '%(USA)%' + AND mc.note LIKE '%(200%)%' + AND mi.info IN ('Germany', + 'German', + 'USA', + 'American') + AND mi_idx.info < '7.0' + AND t.production_year > 2008 + AND kt.id = t.kind_id + AND t.id = mi.movie_id + AND t.id = mk.movie_id + AND t.id = mi_idx.movie_id + AND t.id = mc.movie_id + AND mk.movie_id = mi.movie_id + AND mk.movie_id = mi_idx.movie_id + AND mk.movie_id = mc.movie_id + AND mi.movie_id = mi_idx.movie_id + AND mi.movie_id = mc.movie_id + AND mc.movie_id = mi_idx.movie_id + AND k.id = mk.keyword_id + AND it1.id = mi.info_type_id + AND it2.id = mi_idx.info_type_id + AND ct.id = mc.company_type_id + AND cn.id = mc.company_id; + diff --git a/marble/environments/db_env_docker/join-order-benchmark-master/22b.sql b/marble/environments/db_env_docker/join-order-benchmark-master/22b.sql new file mode 100644 index 00000000..bfd174e3 --- /dev/null +++ b/marble/environments/db_env_docker/join-order-benchmark-master/22b.sql @@ -0,0 +1,48 @@ +SELECT MIN(cn.name) AS movie_company, + MIN(mi_idx.info) AS rating, + MIN(t.title) AS western_violent_movie +FROM company_name AS cn, + company_type AS ct, + info_type AS it1, + info_type AS it2, + keyword AS k, + kind_type AS kt, + movie_companies AS mc, + movie_info AS mi, + movie_info_idx AS mi_idx, + movie_keyword AS mk, + title AS t +WHERE cn.country_code != '[us]' + AND it1.info = 'countries' + AND it2.info = 'rating' + AND k.keyword IN ('murder', + 'murder-in-title', + 'blood', + 'violence') + AND kt.kind IN ('movie', + 'episode') + AND mc.note NOT LIKE '%(USA)%' + AND mc.note LIKE '%(200%)%' + AND mi.info IN ('Germany', + 'German', + 'USA', + 'American') + AND mi_idx.info < '7.0' + AND t.production_year > 2009 + AND kt.id = t.kind_id + AND t.id = mi.movie_id + AND t.id = mk.movie_id + AND t.id = mi_idx.movie_id + AND t.id = mc.movie_id + AND mk.movie_id = mi.movie_id + AND mk.movie_id = mi_idx.movie_id + AND mk.movie_id = mc.movie_id + AND mi.movie_id = mi_idx.movie_id + AND mi.movie_id = mc.movie_id + AND mc.movie_id = mi_idx.movie_id + AND k.id = mk.keyword_id + AND it1.id = mi.info_type_id + AND it2.id = mi_idx.info_type_id + AND ct.id = mc.company_type_id + AND cn.id = mc.company_id; + diff --git a/marble/environments/db_env_docker/join-order-benchmark-master/22c.sql b/marble/environments/db_env_docker/join-order-benchmark-master/22c.sql new file mode 100644 index 00000000..eacec5e0 --- /dev/null +++ b/marble/environments/db_env_docker/join-order-benchmark-master/22c.sql @@ -0,0 +1,54 @@ +SELECT MIN(cn.name) AS movie_company, + MIN(mi_idx.info) AS rating, + MIN(t.title) AS western_violent_movie +FROM company_name AS cn, + company_type AS ct, + info_type AS it1, + info_type AS it2, + keyword AS k, + kind_type AS kt, + movie_companies AS mc, + movie_info AS mi, + movie_info_idx AS mi_idx, + movie_keyword AS mk, + title AS t +WHERE cn.country_code != '[us]' + AND it1.info = 'countries' + AND it2.info = 'rating' + AND k.keyword IN ('murder', + 'murder-in-title', + 'blood', + 'violence') + AND kt.kind IN ('movie', + 'episode') + AND mc.note NOT LIKE '%(USA)%' + AND mc.note LIKE '%(200%)%' + AND mi.info IN ('Sweden', + 'Norway', + 'Germany', + 'Denmark', + 'Swedish', + 'Danish', + 'Norwegian', + 'German', + 'USA', + 'American') + AND mi_idx.info < '8.5' + AND t.production_year > 2005 + AND kt.id = t.kind_id + AND t.id = 
mi.movie_id + AND t.id = mk.movie_id + AND t.id = mi_idx.movie_id + AND t.id = mc.movie_id + AND mk.movie_id = mi.movie_id + AND mk.movie_id = mi_idx.movie_id + AND mk.movie_id = mc.movie_id + AND mi.movie_id = mi_idx.movie_id + AND mi.movie_id = mc.movie_id + AND mc.movie_id = mi_idx.movie_id + AND k.id = mk.keyword_id + AND it1.id = mi.info_type_id + AND it2.id = mi_idx.info_type_id + AND ct.id = mc.company_type_id + AND cn.id = mc.company_id; + diff --git a/marble/environments/db_env_docker/join-order-benchmark-master/22d.sql b/marble/environments/db_env_docker/join-order-benchmark-master/22d.sql new file mode 100644 index 00000000..3442b43f --- /dev/null +++ b/marble/environments/db_env_docker/join-order-benchmark-master/22d.sql @@ -0,0 +1,52 @@ +SELECT MIN(cn.name) AS movie_company, + MIN(mi_idx.info) AS rating, + MIN(t.title) AS western_violent_movie +FROM company_name AS cn, + company_type AS ct, + info_type AS it1, + info_type AS it2, + keyword AS k, + kind_type AS kt, + movie_companies AS mc, + movie_info AS mi, + movie_info_idx AS mi_idx, + movie_keyword AS mk, + title AS t +WHERE cn.country_code != '[us]' + AND it1.info = 'countries' + AND it2.info = 'rating' + AND k.keyword IN ('murder', + 'murder-in-title', + 'blood', + 'violence') + AND kt.kind IN ('movie', + 'episode') + AND mi.info IN ('Sweden', + 'Norway', + 'Germany', + 'Denmark', + 'Swedish', + 'Danish', + 'Norwegian', + 'German', + 'USA', + 'American') + AND mi_idx.info < '8.5' + AND t.production_year > 2005 + AND kt.id = t.kind_id + AND t.id = mi.movie_id + AND t.id = mk.movie_id + AND t.id = mi_idx.movie_id + AND t.id = mc.movie_id + AND mk.movie_id = mi.movie_id + AND mk.movie_id = mi_idx.movie_id + AND mk.movie_id = mc.movie_id + AND mi.movie_id = mi_idx.movie_id + AND mi.movie_id = mc.movie_id + AND mc.movie_id = mi_idx.movie_id + AND k.id = mk.keyword_id + AND it1.id = mi.info_type_id + AND it2.id = mi_idx.info_type_id + AND ct.id = mc.company_type_id + AND cn.id = mc.company_id; + diff --git a/marble/environments/db_env_docker/join-order-benchmark-master/23a.sql b/marble/environments/db_env_docker/join-order-benchmark-master/23a.sql new file mode 100644 index 00000000..60789f38 --- /dev/null +++ b/marble/environments/db_env_docker/join-order-benchmark-master/23a.sql @@ -0,0 +1,39 @@ +SELECT MIN(kt.kind) AS movie_kind, + MIN(t.title) AS complete_us_internet_movie +FROM complete_cast AS cc, + comp_cast_type AS cct1, + company_name AS cn, + company_type AS ct, + info_type AS it1, + keyword AS k, + kind_type AS kt, + movie_companies AS mc, + movie_info AS mi, + movie_keyword AS mk, + title AS t +WHERE cct1.kind = 'complete+verified' + AND cn.country_code = '[us]' + AND it1.info = 'release dates' + AND kt.kind IN ('movie') + AND mi.note LIKE '%internet%' + AND mi.info IS NOT NULL + AND (mi.info LIKE 'USA:% 199%' + OR mi.info LIKE 'USA:% 200%') + AND t.production_year > 2000 + AND kt.id = t.kind_id + AND t.id = mi.movie_id + AND t.id = mk.movie_id + AND t.id = mc.movie_id + AND t.id = cc.movie_id + AND mk.movie_id = mi.movie_id + AND mk.movie_id = mc.movie_id + AND mk.movie_id = cc.movie_id + AND mi.movie_id = mc.movie_id + AND mi.movie_id = cc.movie_id + AND mc.movie_id = cc.movie_id + AND k.id = mk.keyword_id + AND it1.id = mi.info_type_id + AND cn.id = mc.company_id + AND ct.id = mc.company_type_id + AND cct1.id = cc.status_id; + diff --git a/marble/environments/db_env_docker/join-order-benchmark-master/23b.sql b/marble/environments/db_env_docker/join-order-benchmark-master/23b.sql new file mode 100644 index 
00000000..f1ba168f --- /dev/null +++ b/marble/environments/db_env_docker/join-order-benchmark-master/23b.sql @@ -0,0 +1,41 @@ +SELECT MIN(kt.kind) AS movie_kind, + MIN(t.title) AS complete_nerdy_internet_movie +FROM complete_cast AS cc, + comp_cast_type AS cct1, + company_name AS cn, + company_type AS ct, + info_type AS it1, + keyword AS k, + kind_type AS kt, + movie_companies AS mc, + movie_info AS mi, + movie_keyword AS mk, + title AS t +WHERE cct1.kind = 'complete+verified' + AND cn.country_code = '[us]' + AND it1.info = 'release dates' + AND k.keyword IN ('nerd', + 'loner', + 'alienation', + 'dignity') + AND kt.kind IN ('movie') + AND mi.note LIKE '%internet%' + AND mi.info LIKE 'USA:% 200%' + AND t.production_year > 2000 + AND kt.id = t.kind_id + AND t.id = mi.movie_id + AND t.id = mk.movie_id + AND t.id = mc.movie_id + AND t.id = cc.movie_id + AND mk.movie_id = mi.movie_id + AND mk.movie_id = mc.movie_id + AND mk.movie_id = cc.movie_id + AND mi.movie_id = mc.movie_id + AND mi.movie_id = cc.movie_id + AND mc.movie_id = cc.movie_id + AND k.id = mk.keyword_id + AND it1.id = mi.info_type_id + AND cn.id = mc.company_id + AND ct.id = mc.company_type_id + AND cct1.id = cc.status_id; + diff --git a/marble/environments/db_env_docker/join-order-benchmark-master/23c.sql b/marble/environments/db_env_docker/join-order-benchmark-master/23c.sql new file mode 100644 index 00000000..26895f48 --- /dev/null +++ b/marble/environments/db_env_docker/join-order-benchmark-master/23c.sql @@ -0,0 +1,42 @@ +SELECT MIN(kt.kind) AS movie_kind, + MIN(t.title) AS complete_us_internet_movie +FROM complete_cast AS cc, + comp_cast_type AS cct1, + company_name AS cn, + company_type AS ct, + info_type AS it1, + keyword AS k, + kind_type AS kt, + movie_companies AS mc, + movie_info AS mi, + movie_keyword AS mk, + title AS t +WHERE cct1.kind = 'complete+verified' + AND cn.country_code = '[us]' + AND it1.info = 'release dates' + AND kt.kind IN ('movie', + 'tv movie', + 'video movie', + 'video game') + AND mi.note LIKE '%internet%' + AND mi.info IS NOT NULL + AND (mi.info LIKE 'USA:% 199%' + OR mi.info LIKE 'USA:% 200%') + AND t.production_year > 1990 + AND kt.id = t.kind_id + AND t.id = mi.movie_id + AND t.id = mk.movie_id + AND t.id = mc.movie_id + AND t.id = cc.movie_id + AND mk.movie_id = mi.movie_id + AND mk.movie_id = mc.movie_id + AND mk.movie_id = cc.movie_id + AND mi.movie_id = mc.movie_id + AND mi.movie_id = cc.movie_id + AND mc.movie_id = cc.movie_id + AND k.id = mk.keyword_id + AND it1.id = mi.info_type_id + AND cn.id = mc.company_id + AND ct.id = mc.company_type_id + AND cct1.id = cc.status_id; + diff --git a/marble/environments/db_env_docker/join-order-benchmark-master/24a.sql b/marble/environments/db_env_docker/join-order-benchmark-master/24a.sql new file mode 100644 index 00000000..ba0df9c6 --- /dev/null +++ b/marble/environments/db_env_docker/join-order-benchmark-master/24a.sql @@ -0,0 +1,50 @@ +SELECT MIN(chn.name) AS voiced_char_name, + MIN(n.name) AS voicing_actress_name, + MIN(t.title) AS voiced_action_movie_jap_eng +FROM aka_name AS an, + char_name AS chn, + cast_info AS ci, + company_name AS cn, + info_type AS it, + keyword AS k, + movie_companies AS mc, + movie_info AS mi, + movie_keyword AS mk, + name AS n, + role_type AS rt, + title AS t +WHERE ci.note IN ('(voice)', + '(voice: Japanese version)', + '(voice) (uncredited)', + '(voice: English version)') + AND cn.country_code ='[us]' + AND it.info = 'release dates' + AND k.keyword IN ('hero', + 'martial-arts', + 'hand-to-hand-combat') + AND mi.info 
IS NOT NULL + AND (mi.info LIKE 'Japan:%201%' + OR mi.info LIKE 'USA:%201%') + AND n.gender ='f' + AND n.name LIKE '%An%' + AND rt.role ='actress' + AND t.production_year > 2010 + AND t.id = mi.movie_id + AND t.id = mc.movie_id + AND t.id = ci.movie_id + AND t.id = mk.movie_id + AND mc.movie_id = ci.movie_id + AND mc.movie_id = mi.movie_id + AND mc.movie_id = mk.movie_id + AND mi.movie_id = ci.movie_id + AND mi.movie_id = mk.movie_id + AND ci.movie_id = mk.movie_id + AND cn.id = mc.company_id + AND it.id = mi.info_type_id + AND n.id = ci.person_id + AND rt.id = ci.role_id + AND n.id = an.person_id + AND ci.person_id = an.person_id + AND chn.id = ci.person_role_id + AND k.id = mk.keyword_id; + diff --git a/marble/environments/db_env_docker/join-order-benchmark-master/24b.sql b/marble/environments/db_env_docker/join-order-benchmark-master/24b.sql new file mode 100644 index 00000000..ed2482e1 --- /dev/null +++ b/marble/environments/db_env_docker/join-order-benchmark-master/24b.sql @@ -0,0 +1,53 @@ +SELECT MIN(chn.name) AS voiced_char_name, + MIN(n.name) AS voicing_actress_name, + MIN(t.title) AS kung_fu_panda +FROM aka_name AS an, + char_name AS chn, + cast_info AS ci, + company_name AS cn, + info_type AS it, + keyword AS k, + movie_companies AS mc, + movie_info AS mi, + movie_keyword AS mk, + name AS n, + role_type AS rt, + title AS t +WHERE ci.note IN ('(voice)', + '(voice: Japanese version)', + '(voice) (uncredited)', + '(voice: English version)') + AND cn.country_code ='[us]' + AND cn.name = 'DreamWorks Animation' + AND it.info = 'release dates' + AND k.keyword IN ('hero', + 'martial-arts', + 'hand-to-hand-combat', + 'computer-animated-movie') + AND mi.info IS NOT NULL + AND (mi.info LIKE 'Japan:%201%' + OR mi.info LIKE 'USA:%201%') + AND n.gender ='f' + AND n.name LIKE '%An%' + AND rt.role ='actress' + AND t.production_year > 2010 + AND t.title LIKE 'Kung Fu Panda%' + AND t.id = mi.movie_id + AND t.id = mc.movie_id + AND t.id = ci.movie_id + AND t.id = mk.movie_id + AND mc.movie_id = ci.movie_id + AND mc.movie_id = mi.movie_id + AND mc.movie_id = mk.movie_id + AND mi.movie_id = ci.movie_id + AND mi.movie_id = mk.movie_id + AND ci.movie_id = mk.movie_id + AND cn.id = mc.company_id + AND it.id = mi.info_type_id + AND n.id = ci.person_id + AND rt.id = ci.role_id + AND n.id = an.person_id + AND ci.person_id = an.person_id + AND chn.id = ci.person_role_id + AND k.id = mk.keyword_id; + diff --git a/marble/environments/db_env_docker/join-order-benchmark-master/25a.sql b/marble/environments/db_env_docker/join-order-benchmark-master/25a.sql new file mode 100644 index 00000000..80f52d0a --- /dev/null +++ b/marble/environments/db_env_docker/join-order-benchmark-master/25a.sql @@ -0,0 +1,42 @@ +SELECT MIN(mi.info) AS movie_budget, + MIN(mi_idx.info) AS movie_votes, + MIN(n.name) AS male_writer, + MIN(t.title) AS violent_movie_title +FROM cast_info AS ci, + info_type AS it1, + info_type AS it2, + keyword AS k, + movie_info AS mi, + movie_info_idx AS mi_idx, + movie_keyword AS mk, + name AS n, + title AS t +WHERE ci.note IN ('(writer)', + '(head writer)', + '(written by)', + '(story)', + '(story editor)') + AND it1.info = 'genres' + AND it2.info = 'votes' + AND k.keyword IN ('murder', + 'blood', + 'gore', + 'death', + 'female-nudity') + AND mi.info = 'Horror' + AND n.gender = 'm' + AND t.id = mi.movie_id + AND t.id = mi_idx.movie_id + AND t.id = ci.movie_id + AND t.id = mk.movie_id + AND ci.movie_id = mi.movie_id + AND ci.movie_id = mi_idx.movie_id + AND ci.movie_id = mk.movie_id + AND mi.movie_id = 
mi_idx.movie_id + AND mi.movie_id = mk.movie_id + AND mi_idx.movie_id = mk.movie_id + AND n.id = ci.person_id + AND it1.id = mi.info_type_id + AND it2.id = mi_idx.info_type_id + AND k.id = mk.keyword_id; + diff --git a/marble/environments/db_env_docker/join-order-benchmark-master/25b.sql b/marble/environments/db_env_docker/join-order-benchmark-master/25b.sql new file mode 100644 index 00000000..e752c3ef --- /dev/null +++ b/marble/environments/db_env_docker/join-order-benchmark-master/25b.sql @@ -0,0 +1,44 @@ +SELECT MIN(mi.info) AS movie_budget, + MIN(mi_idx.info) AS movie_votes, + MIN(n.name) AS male_writer, + MIN(t.title) AS violent_movie_title +FROM cast_info AS ci, + info_type AS it1, + info_type AS it2, + keyword AS k, + movie_info AS mi, + movie_info_idx AS mi_idx, + movie_keyword AS mk, + name AS n, + title AS t +WHERE ci.note IN ('(writer)', + '(head writer)', + '(written by)', + '(story)', + '(story editor)') + AND it1.info = 'genres' + AND it2.info = 'votes' + AND k.keyword IN ('murder', + 'blood', + 'gore', + 'death', + 'female-nudity') + AND mi.info = 'Horror' + AND n.gender = 'm' + AND t.production_year > 2010 + AND t.title LIKE 'Vampire%' + AND t.id = mi.movie_id + AND t.id = mi_idx.movie_id + AND t.id = ci.movie_id + AND t.id = mk.movie_id + AND ci.movie_id = mi.movie_id + AND ci.movie_id = mi_idx.movie_id + AND ci.movie_id = mk.movie_id + AND mi.movie_id = mi_idx.movie_id + AND mi.movie_id = mk.movie_id + AND mi_idx.movie_id = mk.movie_id + AND n.id = ci.person_id + AND it1.id = mi.info_type_id + AND it2.id = mi_idx.info_type_id + AND k.id = mk.keyword_id; + diff --git a/marble/environments/db_env_docker/join-order-benchmark-master/25c.sql b/marble/environments/db_env_docker/join-order-benchmark-master/25c.sql new file mode 100644 index 00000000..3f0fc18e --- /dev/null +++ b/marble/environments/db_env_docker/join-order-benchmark-master/25c.sql @@ -0,0 +1,49 @@ +SELECT MIN(mi.info) AS movie_budget, + MIN(mi_idx.info) AS movie_votes, + MIN(n.name) AS male_writer, + MIN(t.title) AS violent_movie_title +FROM cast_info AS ci, + info_type AS it1, + info_type AS it2, + keyword AS k, + movie_info AS mi, + movie_info_idx AS mi_idx, + movie_keyword AS mk, + name AS n, + title AS t +WHERE ci.note IN ('(writer)', + '(head writer)', + '(written by)', + '(story)', + '(story editor)') + AND it1.info = 'genres' + AND it2.info = 'votes' + AND k.keyword IN ('murder', + 'violence', + 'blood', + 'gore', + 'death', + 'female-nudity', + 'hospital') + AND mi.info IN ('Horror', + 'Action', + 'Sci-Fi', + 'Thriller', + 'Crime', + 'War') + AND n.gender = 'm' + AND t.id = mi.movie_id + AND t.id = mi_idx.movie_id + AND t.id = ci.movie_id + AND t.id = mk.movie_id + AND ci.movie_id = mi.movie_id + AND ci.movie_id = mi_idx.movie_id + AND ci.movie_id = mk.movie_id + AND mi.movie_id = mi_idx.movie_id + AND mi.movie_id = mk.movie_id + AND mi_idx.movie_id = mk.movie_id + AND n.id = ci.person_id + AND it1.id = mi.info_type_id + AND it2.id = mi_idx.info_type_id + AND k.id = mk.keyword_id; + diff --git a/marble/environments/db_env_docker/join-order-benchmark-master/26a.sql b/marble/environments/db_env_docker/join-order-benchmark-master/26a.sql new file mode 100644 index 00000000..3dc3c329 --- /dev/null +++ b/marble/environments/db_env_docker/join-order-benchmark-master/26a.sql @@ -0,0 +1,53 @@ +SELECT MIN(chn.name) AS character_name, + MIN(mi_idx.info) AS rating, + MIN(n.name) AS playing_actor, + MIN(t.title) AS complete_hero_movie +FROM complete_cast AS cc, + comp_cast_type AS cct1, + comp_cast_type AS cct2, + 
char_name AS chn, + cast_info AS ci, + info_type AS it2, + keyword AS k, + kind_type AS kt, + movie_info_idx AS mi_idx, + movie_keyword AS mk, + name AS n, + title AS t +WHERE cct1.kind = 'cast' + AND cct2.kind LIKE '%complete%' + AND chn.name IS NOT NULL + AND (chn.name LIKE '%man%' + OR chn.name LIKE '%Man%') + AND it2.info = 'rating' + AND k.keyword IN ('superhero', + 'marvel-comics', + 'based-on-comic', + 'tv-special', + 'fight', + 'violence', + 'magnet', + 'web', + 'claw', + 'laser') + AND kt.kind = 'movie' + AND mi_idx.info > '7.0' + AND t.production_year > 2000 + AND kt.id = t.kind_id + AND t.id = mk.movie_id + AND t.id = ci.movie_id + AND t.id = cc.movie_id + AND t.id = mi_idx.movie_id + AND mk.movie_id = ci.movie_id + AND mk.movie_id = cc.movie_id + AND mk.movie_id = mi_idx.movie_id + AND ci.movie_id = cc.movie_id + AND ci.movie_id = mi_idx.movie_id + AND cc.movie_id = mi_idx.movie_id + AND chn.id = ci.person_role_id + AND n.id = ci.person_id + AND k.id = mk.keyword_id + AND cct1.id = cc.subject_id + AND cct2.id = cc.status_id + AND it2.id = mi_idx.info_type_id; + diff --git a/marble/environments/db_env_docker/join-order-benchmark-master/26b.sql b/marble/environments/db_env_docker/join-order-benchmark-master/26b.sql new file mode 100644 index 00000000..71e912e4 --- /dev/null +++ b/marble/environments/db_env_docker/join-order-benchmark-master/26b.sql @@ -0,0 +1,46 @@ +SELECT MIN(chn.name) AS character_name, + MIN(mi_idx.info) AS rating, + MIN(t.title) AS complete_hero_movie +FROM complete_cast AS cc, + comp_cast_type AS cct1, + comp_cast_type AS cct2, + char_name AS chn, + cast_info AS ci, + info_type AS it2, + keyword AS k, + kind_type AS kt, + movie_info_idx AS mi_idx, + movie_keyword AS mk, + name AS n, + title AS t +WHERE cct1.kind = 'cast' + AND cct2.kind LIKE '%complete%' + AND chn.name IS NOT NULL + AND (chn.name LIKE '%man%' + OR chn.name LIKE '%Man%') + AND it2.info = 'rating' + AND k.keyword IN ('superhero', + 'marvel-comics', + 'based-on-comic', + 'fight') + AND kt.kind = 'movie' + AND mi_idx.info > '8.0' + AND t.production_year > 2005 + AND kt.id = t.kind_id + AND t.id = mk.movie_id + AND t.id = ci.movie_id + AND t.id = cc.movie_id + AND t.id = mi_idx.movie_id + AND mk.movie_id = ci.movie_id + AND mk.movie_id = cc.movie_id + AND mk.movie_id = mi_idx.movie_id + AND ci.movie_id = cc.movie_id + AND ci.movie_id = mi_idx.movie_id + AND cc.movie_id = mi_idx.movie_id + AND chn.id = ci.person_role_id + AND n.id = ci.person_id + AND k.id = mk.keyword_id + AND cct1.id = cc.subject_id + AND cct2.id = cc.status_id + AND it2.id = mi_idx.info_type_id; + diff --git a/marble/environments/db_env_docker/join-order-benchmark-master/26c.sql b/marble/environments/db_env_docker/join-order-benchmark-master/26c.sql new file mode 100644 index 00000000..45e36c64 --- /dev/null +++ b/marble/environments/db_env_docker/join-order-benchmark-master/26c.sql @@ -0,0 +1,51 @@ +SELECT MIN(chn.name) AS character_name, + MIN(mi_idx.info) AS rating, + MIN(t.title) AS complete_hero_movie +FROM complete_cast AS cc, + comp_cast_type AS cct1, + comp_cast_type AS cct2, + char_name AS chn, + cast_info AS ci, + info_type AS it2, + keyword AS k, + kind_type AS kt, + movie_info_idx AS mi_idx, + movie_keyword AS mk, + name AS n, + title AS t +WHERE cct1.kind = 'cast' + AND cct2.kind LIKE '%complete%' + AND chn.name IS NOT NULL + AND (chn.name LIKE '%man%' + OR chn.name LIKE '%Man%') + AND it2.info = 'rating' + AND k.keyword IN ('superhero', + 'marvel-comics', + 'based-on-comic', + 'tv-special', + 'fight', + 
'violence', + 'magnet', + 'web', + 'claw', + 'laser') + AND kt.kind = 'movie' + AND t.production_year > 2000 + AND kt.id = t.kind_id + AND t.id = mk.movie_id + AND t.id = ci.movie_id + AND t.id = cc.movie_id + AND t.id = mi_idx.movie_id + AND mk.movie_id = ci.movie_id + AND mk.movie_id = cc.movie_id + AND mk.movie_id = mi_idx.movie_id + AND ci.movie_id = cc.movie_id + AND ci.movie_id = mi_idx.movie_id + AND cc.movie_id = mi_idx.movie_id + AND chn.id = ci.person_role_id + AND n.id = ci.person_id + AND k.id = mk.keyword_id + AND cct1.id = cc.subject_id + AND cct2.id = cc.status_id + AND it2.id = mi_idx.info_type_id; + diff --git a/marble/environments/db_env_docker/join-order-benchmark-master/27a.sql b/marble/environments/db_env_docker/join-order-benchmark-master/27a.sql new file mode 100644 index 00000000..091da711 --- /dev/null +++ b/marble/environments/db_env_docker/join-order-benchmark-master/27a.sql @@ -0,0 +1,52 @@ +SELECT MIN(cn.name) AS producing_company, + MIN(lt.link) AS link_type, + MIN(t.title) AS complete_western_sequel +FROM complete_cast AS cc, + comp_cast_type AS cct1, + comp_cast_type AS cct2, + company_name AS cn, + company_type AS ct, + keyword AS k, + link_type AS lt, + movie_companies AS mc, + movie_info AS mi, + movie_keyword AS mk, + movie_link AS ml, + title AS t +WHERE cct1.kind IN ('cast', + 'crew') + AND cct2.kind = 'complete' + AND cn.country_code !='[pl]' + AND (cn.name LIKE '%Film%' + OR cn.name LIKE '%Warner%') + AND ct.kind ='production companies' + AND k.keyword ='sequel' + AND lt.link LIKE '%follow%' + AND mc.note IS NULL + AND mi.info IN ('Sweden', + 'Germany', + 'Swedish', + 'German') + AND t.production_year BETWEEN 1950 AND 2000 + AND lt.id = ml.link_type_id + AND ml.movie_id = t.id + AND t.id = mk.movie_id + AND mk.keyword_id = k.id + AND t.id = mc.movie_id + AND mc.company_type_id = ct.id + AND mc.company_id = cn.id + AND mi.movie_id = t.id + AND t.id = cc.movie_id + AND cct1.id = cc.subject_id + AND cct2.id = cc.status_id + AND ml.movie_id = mk.movie_id + AND ml.movie_id = mc.movie_id + AND mk.movie_id = mc.movie_id + AND ml.movie_id = mi.movie_id + AND mk.movie_id = mi.movie_id + AND mc.movie_id = mi.movie_id + AND ml.movie_id = cc.movie_id + AND mk.movie_id = cc.movie_id + AND mc.movie_id = cc.movie_id + AND mi.movie_id = cc.movie_id; + diff --git a/marble/environments/db_env_docker/join-order-benchmark-master/27b.sql b/marble/environments/db_env_docker/join-order-benchmark-master/27b.sql new file mode 100644 index 00000000..580ab710 --- /dev/null +++ b/marble/environments/db_env_docker/join-order-benchmark-master/27b.sql @@ -0,0 +1,52 @@ +SELECT MIN(cn.name) AS producing_company, + MIN(lt.link) AS link_type, + MIN(t.title) AS complete_western_sequel +FROM complete_cast AS cc, + comp_cast_type AS cct1, + comp_cast_type AS cct2, + company_name AS cn, + company_type AS ct, + keyword AS k, + link_type AS lt, + movie_companies AS mc, + movie_info AS mi, + movie_keyword AS mk, + movie_link AS ml, + title AS t +WHERE cct1.kind IN ('cast', + 'crew') + AND cct2.kind = 'complete' + AND cn.country_code !='[pl]' + AND (cn.name LIKE '%Film%' + OR cn.name LIKE '%Warner%') + AND ct.kind ='production companies' + AND k.keyword ='sequel' + AND lt.link LIKE '%follow%' + AND mc.note IS NULL + AND mi.info IN ('Sweden', + 'Germany', + 'Swedish', + 'German') + AND t.production_year = 1998 + AND lt.id = ml.link_type_id + AND ml.movie_id = t.id + AND t.id = mk.movie_id + AND mk.keyword_id = k.id + AND t.id = mc.movie_id + AND mc.company_type_id = ct.id + AND mc.company_id = 
cn.id + AND mi.movie_id = t.id + AND t.id = cc.movie_id + AND cct1.id = cc.subject_id + AND cct2.id = cc.status_id + AND ml.movie_id = mk.movie_id + AND ml.movie_id = mc.movie_id + AND mk.movie_id = mc.movie_id + AND ml.movie_id = mi.movie_id + AND mk.movie_id = mi.movie_id + AND mc.movie_id = mi.movie_id + AND ml.movie_id = cc.movie_id + AND mk.movie_id = cc.movie_id + AND mc.movie_id = cc.movie_id + AND mi.movie_id = cc.movie_id; + diff --git a/marble/environments/db_env_docker/join-order-benchmark-master/27c.sql b/marble/environments/db_env_docker/join-order-benchmark-master/27c.sql new file mode 100644 index 00000000..f070c817 --- /dev/null +++ b/marble/environments/db_env_docker/join-order-benchmark-master/27c.sql @@ -0,0 +1,56 @@ +SELECT MIN(cn.name) AS producing_company, + MIN(lt.link) AS link_type, + MIN(t.title) AS complete_western_sequel +FROM complete_cast AS cc, + comp_cast_type AS cct1, + comp_cast_type AS cct2, + company_name AS cn, + company_type AS ct, + keyword AS k, + link_type AS lt, + movie_companies AS mc, + movie_info AS mi, + movie_keyword AS mk, + movie_link AS ml, + title AS t +WHERE cct1.kind = 'cast' + AND cct2.kind LIKE 'complete%' + AND cn.country_code !='[pl]' + AND (cn.name LIKE '%Film%' + OR cn.name LIKE '%Warner%') + AND ct.kind ='production companies' + AND k.keyword ='sequel' + AND lt.link LIKE '%follow%' + AND mc.note IS NULL + AND mi.info IN ('Sweden', + 'Norway', + 'Germany', + 'Denmark', + 'Swedish', + 'Denish', + 'Norwegian', + 'German', + 'English') + AND t.production_year BETWEEN 1950 AND 2010 + AND lt.id = ml.link_type_id + AND ml.movie_id = t.id + AND t.id = mk.movie_id + AND mk.keyword_id = k.id + AND t.id = mc.movie_id + AND mc.company_type_id = ct.id + AND mc.company_id = cn.id + AND mi.movie_id = t.id + AND t.id = cc.movie_id + AND cct1.id = cc.subject_id + AND cct2.id = cc.status_id + AND ml.movie_id = mk.movie_id + AND ml.movie_id = mc.movie_id + AND mk.movie_id = mc.movie_id + AND ml.movie_id = mi.movie_id + AND mk.movie_id = mi.movie_id + AND mc.movie_id = mi.movie_id + AND ml.movie_id = cc.movie_id + AND mk.movie_id = cc.movie_id + AND mc.movie_id = cc.movie_id + AND mi.movie_id = cc.movie_id; + diff --git a/marble/environments/db_env_docker/join-order-benchmark-master/28a.sql b/marble/environments/db_env_docker/join-order-benchmark-master/28a.sql new file mode 100644 index 00000000..5fea698d --- /dev/null +++ b/marble/environments/db_env_docker/join-order-benchmark-master/28a.sql @@ -0,0 +1,66 @@ +SELECT MIN(cn.name) AS movie_company, + MIN(mi_idx.info) AS rating, + MIN(t.title) AS complete_euro_dark_movie +FROM complete_cast AS cc, + comp_cast_type AS cct1, + comp_cast_type AS cct2, + company_name AS cn, + company_type AS ct, + info_type AS it1, + info_type AS it2, + keyword AS k, + kind_type AS kt, + movie_companies AS mc, + movie_info AS mi, + movie_info_idx AS mi_idx, + movie_keyword AS mk, + title AS t +WHERE cct1.kind = 'crew' + AND cct2.kind != 'complete+verified' + AND cn.country_code != '[us]' + AND it1.info = 'countries' + AND it2.info = 'rating' + AND k.keyword IN ('murder', + 'murder-in-title', + 'blood', + 'violence') + AND kt.kind IN ('movie', + 'episode') + AND mc.note NOT LIKE '%(USA)%' + AND mc.note LIKE '%(200%)%' + AND mi.info IN ('Sweden', + 'Norway', + 'Germany', + 'Denmark', + 'Swedish', + 'Danish', + 'Norwegian', + 'German', + 'USA', + 'American') + AND mi_idx.info < '8.5' + AND t.production_year > 2000 + AND kt.id = t.kind_id + AND t.id = mi.movie_id + AND t.id = mk.movie_id + AND t.id = mi_idx.movie_id + AND 
t.id = mc.movie_id + AND t.id = cc.movie_id + AND mk.movie_id = mi.movie_id + AND mk.movie_id = mi_idx.movie_id + AND mk.movie_id = mc.movie_id + AND mk.movie_id = cc.movie_id + AND mi.movie_id = mi_idx.movie_id + AND mi.movie_id = mc.movie_id + AND mi.movie_id = cc.movie_id + AND mc.movie_id = mi_idx.movie_id + AND mc.movie_id = cc.movie_id + AND mi_idx.movie_id = cc.movie_id + AND k.id = mk.keyword_id + AND it1.id = mi.info_type_id + AND it2.id = mi_idx.info_type_id + AND ct.id = mc.company_type_id + AND cn.id = mc.company_id + AND cct1.id = cc.subject_id + AND cct2.id = cc.status_id; + diff --git a/marble/environments/db_env_docker/join-order-benchmark-master/28b.sql b/marble/environments/db_env_docker/join-order-benchmark-master/28b.sql new file mode 100644 index 00000000..c368ab93 --- /dev/null +++ b/marble/environments/db_env_docker/join-order-benchmark-master/28b.sql @@ -0,0 +1,60 @@ +SELECT MIN(cn.name) AS movie_company, + MIN(mi_idx.info) AS rating, + MIN(t.title) AS complete_euro_dark_movie +FROM complete_cast AS cc, + comp_cast_type AS cct1, + comp_cast_type AS cct2, + company_name AS cn, + company_type AS ct, + info_type AS it1, + info_type AS it2, + keyword AS k, + kind_type AS kt, + movie_companies AS mc, + movie_info AS mi, + movie_info_idx AS mi_idx, + movie_keyword AS mk, + title AS t +WHERE cct1.kind = 'crew' + AND cct2.kind != 'complete+verified' + AND cn.country_code != '[us]' + AND it1.info = 'countries' + AND it2.info = 'rating' + AND k.keyword IN ('murder', + 'murder-in-title', + 'blood', + 'violence') + AND kt.kind IN ('movie', + 'episode') + AND mc.note NOT LIKE '%(USA)%' + AND mc.note LIKE '%(200%)%' + AND mi.info IN ('Sweden', + 'Germany', + 'Swedish', + 'German') + AND mi_idx.info > '6.5' + AND t.production_year > 2005 + AND kt.id = t.kind_id + AND t.id = mi.movie_id + AND t.id = mk.movie_id + AND t.id = mi_idx.movie_id + AND t.id = mc.movie_id + AND t.id = cc.movie_id + AND mk.movie_id = mi.movie_id + AND mk.movie_id = mi_idx.movie_id + AND mk.movie_id = mc.movie_id + AND mk.movie_id = cc.movie_id + AND mi.movie_id = mi_idx.movie_id + AND mi.movie_id = mc.movie_id + AND mi.movie_id = cc.movie_id + AND mc.movie_id = mi_idx.movie_id + AND mc.movie_id = cc.movie_id + AND mi_idx.movie_id = cc.movie_id + AND k.id = mk.keyword_id + AND it1.id = mi.info_type_id + AND it2.id = mi_idx.info_type_id + AND ct.id = mc.company_type_id + AND cn.id = mc.company_id + AND cct1.id = cc.subject_id + AND cct2.id = cc.status_id; + diff --git a/marble/environments/db_env_docker/join-order-benchmark-master/28c.sql b/marble/environments/db_env_docker/join-order-benchmark-master/28c.sql new file mode 100644 index 00000000..796852a2 --- /dev/null +++ b/marble/environments/db_env_docker/join-order-benchmark-master/28c.sql @@ -0,0 +1,66 @@ +SELECT MIN(cn.name) AS movie_company, + MIN(mi_idx.info) AS rating, + MIN(t.title) AS complete_euro_dark_movie +FROM complete_cast AS cc, + comp_cast_type AS cct1, + comp_cast_type AS cct2, + company_name AS cn, + company_type AS ct, + info_type AS it1, + info_type AS it2, + keyword AS k, + kind_type AS kt, + movie_companies AS mc, + movie_info AS mi, + movie_info_idx AS mi_idx, + movie_keyword AS mk, + title AS t +WHERE cct1.kind = 'cast' + AND cct2.kind = 'complete' + AND cn.country_code != '[us]' + AND it1.info = 'countries' + AND it2.info = 'rating' + AND k.keyword IN ('murder', + 'murder-in-title', + 'blood', + 'violence') + AND kt.kind IN ('movie', + 'episode') + AND mc.note NOT LIKE '%(USA)%' + AND mc.note LIKE '%(200%)%' + AND mi.info IN 
('Sweden', + 'Norway', + 'Germany', + 'Denmark', + 'Swedish', + 'Danish', + 'Norwegian', + 'German', + 'USA', + 'American') + AND mi_idx.info < '8.5' + AND t.production_year > 2005 + AND kt.id = t.kind_id + AND t.id = mi.movie_id + AND t.id = mk.movie_id + AND t.id = mi_idx.movie_id + AND t.id = mc.movie_id + AND t.id = cc.movie_id + AND mk.movie_id = mi.movie_id + AND mk.movie_id = mi_idx.movie_id + AND mk.movie_id = mc.movie_id + AND mk.movie_id = cc.movie_id + AND mi.movie_id = mi_idx.movie_id + AND mi.movie_id = mc.movie_id + AND mi.movie_id = cc.movie_id + AND mc.movie_id = mi_idx.movie_id + AND mc.movie_id = cc.movie_id + AND mi_idx.movie_id = cc.movie_id + AND k.id = mk.keyword_id + AND it1.id = mi.info_type_id + AND it2.id = mi_idx.info_type_id + AND ct.id = mc.company_type_id + AND cn.id = mc.company_id + AND cct1.id = cc.subject_id + AND cct2.id = cc.status_id; + diff --git a/marble/environments/db_env_docker/join-order-benchmark-master/29a.sql b/marble/environments/db_env_docker/join-order-benchmark-master/29a.sql new file mode 100644 index 00000000..b4d60298 --- /dev/null +++ b/marble/environments/db_env_docker/join-order-benchmark-master/29a.sql @@ -0,0 +1,67 @@ +SELECT MIN(chn.name) AS voiced_char, + MIN(n.name) AS voicing_actress, + MIN(t.title) AS voiced_animation +FROM aka_name AS an, + complete_cast AS cc, + comp_cast_type AS cct1, + comp_cast_type AS cct2, + char_name AS chn, + cast_info AS ci, + company_name AS cn, + info_type AS it, + info_type AS it3, + keyword AS k, + movie_companies AS mc, + movie_info AS mi, + movie_keyword AS mk, + name AS n, + person_info AS pi, + role_type AS rt, + title AS t +WHERE cct1.kind ='cast' + AND cct2.kind ='complete+verified' + AND chn.name = 'Queen' + AND ci.note IN ('(voice)', + '(voice) (uncredited)', + '(voice: English version)') + AND cn.country_code ='[us]' + AND it.info = 'release dates' + AND it3.info = 'trivia' + AND k.keyword = 'computer-animation' + AND mi.info IS NOT NULL + AND (mi.info LIKE 'Japan:%200%' + OR mi.info LIKE 'USA:%200%') + AND n.gender ='f' + AND n.name LIKE '%An%' + AND rt.role ='actress' + AND t.title = 'Shrek 2' + AND t.production_year BETWEEN 2000 AND 2010 + AND t.id = mi.movie_id + AND t.id = mc.movie_id + AND t.id = ci.movie_id + AND t.id = mk.movie_id + AND t.id = cc.movie_id + AND mc.movie_id = ci.movie_id + AND mc.movie_id = mi.movie_id + AND mc.movie_id = mk.movie_id + AND mc.movie_id = cc.movie_id + AND mi.movie_id = ci.movie_id + AND mi.movie_id = mk.movie_id + AND mi.movie_id = cc.movie_id + AND ci.movie_id = mk.movie_id + AND ci.movie_id = cc.movie_id + AND mk.movie_id = cc.movie_id + AND cn.id = mc.company_id + AND it.id = mi.info_type_id + AND n.id = ci.person_id + AND rt.id = ci.role_id + AND n.id = an.person_id + AND ci.person_id = an.person_id + AND chn.id = ci.person_role_id + AND n.id = pi.person_id + AND ci.person_id = pi.person_id + AND it3.id = pi.info_type_id + AND k.id = mk.keyword_id + AND cct1.id = cc.subject_id + AND cct2.id = cc.status_id; + diff --git a/marble/environments/db_env_docker/join-order-benchmark-master/29b.sql b/marble/environments/db_env_docker/join-order-benchmark-master/29b.sql new file mode 100644 index 00000000..c33d9677 --- /dev/null +++ b/marble/environments/db_env_docker/join-order-benchmark-master/29b.sql @@ -0,0 +1,65 @@ +SELECT MIN(chn.name) AS voiced_char, + MIN(n.name) AS voicing_actress, + MIN(t.title) AS voiced_animation +FROM aka_name AS an, + complete_cast AS cc, + comp_cast_type AS cct1, + comp_cast_type AS cct2, + char_name AS chn, + cast_info AS 
ci, + company_name AS cn, + info_type AS it, + info_type AS it3, + keyword AS k, + movie_companies AS mc, + movie_info AS mi, + movie_keyword AS mk, + name AS n, + person_info AS pi, + role_type AS rt, + title AS t +WHERE cct1.kind ='cast' + AND cct2.kind ='complete+verified' + AND chn.name = 'Queen' + AND ci.note IN ('(voice)', + '(voice) (uncredited)', + '(voice: English version)') + AND cn.country_code ='[us]' + AND it.info = 'release dates' + AND it3.info = 'height' + AND k.keyword = 'computer-animation' + AND mi.info LIKE 'USA:%200%' + AND n.gender ='f' + AND n.name LIKE '%An%' + AND rt.role ='actress' + AND t.title = 'Shrek 2' + AND t.production_year BETWEEN 2000 AND 2005 + AND t.id = mi.movie_id + AND t.id = mc.movie_id + AND t.id = ci.movie_id + AND t.id = mk.movie_id + AND t.id = cc.movie_id + AND mc.movie_id = ci.movie_id + AND mc.movie_id = mi.movie_id + AND mc.movie_id = mk.movie_id + AND mc.movie_id = cc.movie_id + AND mi.movie_id = ci.movie_id + AND mi.movie_id = mk.movie_id + AND mi.movie_id = cc.movie_id + AND ci.movie_id = mk.movie_id + AND ci.movie_id = cc.movie_id + AND mk.movie_id = cc.movie_id + AND cn.id = mc.company_id + AND it.id = mi.info_type_id + AND n.id = ci.person_id + AND rt.id = ci.role_id + AND n.id = an.person_id + AND ci.person_id = an.person_id + AND chn.id = ci.person_role_id + AND n.id = pi.person_id + AND ci.person_id = pi.person_id + AND it3.id = pi.info_type_id + AND k.id = mk.keyword_id + AND cct1.id = cc.subject_id + AND cct2.id = cc.status_id; + diff --git a/marble/environments/db_env_docker/join-order-benchmark-master/29c.sql b/marble/environments/db_env_docker/join-order-benchmark-master/29c.sql new file mode 100644 index 00000000..b2ef8785 --- /dev/null +++ b/marble/environments/db_env_docker/join-order-benchmark-master/29c.sql @@ -0,0 +1,66 @@ +SELECT MIN(chn.name) AS voiced_char, + MIN(n.name) AS voicing_actress, + MIN(t.title) AS voiced_animation +FROM aka_name AS an, + complete_cast AS cc, + comp_cast_type AS cct1, + comp_cast_type AS cct2, + char_name AS chn, + cast_info AS ci, + company_name AS cn, + info_type AS it, + info_type AS it3, + keyword AS k, + movie_companies AS mc, + movie_info AS mi, + movie_keyword AS mk, + name AS n, + person_info AS pi, + role_type AS rt, + title AS t +WHERE cct1.kind ='cast' + AND cct2.kind ='complete+verified' + AND ci.note IN ('(voice)', + '(voice: Japanese version)', + '(voice) (uncredited)', + '(voice: English version)') + AND cn.country_code ='[us]' + AND it.info = 'release dates' + AND it3.info = 'trivia' + AND k.keyword = 'computer-animation' + AND mi.info IS NOT NULL + AND (mi.info LIKE 'Japan:%200%' + OR mi.info LIKE 'USA:%200%') + AND n.gender ='f' + AND n.name LIKE '%An%' + AND rt.role ='actress' + AND t.production_year BETWEEN 2000 AND 2010 + AND t.id = mi.movie_id + AND t.id = mc.movie_id + AND t.id = ci.movie_id + AND t.id = mk.movie_id + AND t.id = cc.movie_id + AND mc.movie_id = ci.movie_id + AND mc.movie_id = mi.movie_id + AND mc.movie_id = mk.movie_id + AND mc.movie_id = cc.movie_id + AND mi.movie_id = ci.movie_id + AND mi.movie_id = mk.movie_id + AND mi.movie_id = cc.movie_id + AND ci.movie_id = mk.movie_id + AND ci.movie_id = cc.movie_id + AND mk.movie_id = cc.movie_id + AND cn.id = mc.company_id + AND it.id = mi.info_type_id + AND n.id = ci.person_id + AND rt.id = ci.role_id + AND n.id = an.person_id + AND ci.person_id = an.person_id + AND chn.id = ci.person_role_id + AND n.id = pi.person_id + AND ci.person_id = pi.person_id + AND it3.id = pi.info_type_id + AND k.id = mk.keyword_id + 
AND cct1.id = cc.subject_id + AND cct2.id = cc.status_id; + diff --git a/marble/environments/db_env_docker/join-order-benchmark-master/2a.sql b/marble/environments/db_env_docker/join-order-benchmark-master/2a.sql new file mode 100644 index 00000000..6e6b46ed --- /dev/null +++ b/marble/environments/db_env_docker/join-order-benchmark-master/2a.sql @@ -0,0 +1,14 @@ +SELECT MIN(t.title) AS movie_title +FROM company_name AS cn, + keyword AS k, + movie_companies AS mc, + movie_keyword AS mk, + title AS t +WHERE cn.country_code ='[de]' + AND k.keyword ='character-name-in-title' + AND cn.id = mc.company_id + AND mc.movie_id = t.id + AND t.id = mk.movie_id + AND mk.keyword_id = k.id + AND mc.movie_id = mk.movie_id; + diff --git a/marble/environments/db_env_docker/join-order-benchmark-master/2b.sql b/marble/environments/db_env_docker/join-order-benchmark-master/2b.sql new file mode 100644 index 00000000..1c594b46 --- /dev/null +++ b/marble/environments/db_env_docker/join-order-benchmark-master/2b.sql @@ -0,0 +1,14 @@ +SELECT MIN(t.title) AS movie_title +FROM company_name AS cn, + keyword AS k, + movie_companies AS mc, + movie_keyword AS mk, + title AS t +WHERE cn.country_code ='[nl]' + AND k.keyword ='character-name-in-title' + AND cn.id = mc.company_id + AND mc.movie_id = t.id + AND t.id = mk.movie_id + AND mk.keyword_id = k.id + AND mc.movie_id = mk.movie_id; + diff --git a/marble/environments/db_env_docker/join-order-benchmark-master/2c.sql b/marble/environments/db_env_docker/join-order-benchmark-master/2c.sql new file mode 100644 index 00000000..905453e8 --- /dev/null +++ b/marble/environments/db_env_docker/join-order-benchmark-master/2c.sql @@ -0,0 +1,14 @@ +SELECT MIN(t.title) AS movie_title +FROM company_name AS cn, + keyword AS k, + movie_companies AS mc, + movie_keyword AS mk, + title AS t +WHERE cn.country_code ='[sm]' + AND k.keyword ='character-name-in-title' + AND cn.id = mc.company_id + AND mc.movie_id = t.id + AND t.id = mk.movie_id + AND mk.keyword_id = k.id + AND mc.movie_id = mk.movie_id; + diff --git a/marble/environments/db_env_docker/join-order-benchmark-master/2d.sql b/marble/environments/db_env_docker/join-order-benchmark-master/2d.sql new file mode 100644 index 00000000..b7f7cedf --- /dev/null +++ b/marble/environments/db_env_docker/join-order-benchmark-master/2d.sql @@ -0,0 +1,14 @@ +SELECT MIN(t.title) AS movie_title +FROM company_name AS cn, + keyword AS k, + movie_companies AS mc, + movie_keyword AS mk, + title AS t +WHERE cn.country_code ='[us]' + AND k.keyword ='character-name-in-title' + AND cn.id = mc.company_id + AND mc.movie_id = t.id + AND t.id = mk.movie_id + AND mk.keyword_id = k.id + AND mc.movie_id = mk.movie_id; + diff --git a/marble/environments/db_env_docker/join-order-benchmark-master/30a.sql b/marble/environments/db_env_docker/join-order-benchmark-master/30a.sql new file mode 100644 index 00000000..7b45ac55 --- /dev/null +++ b/marble/environments/db_env_docker/join-order-benchmark-master/30a.sql @@ -0,0 +1,59 @@ +SELECT MIN(mi.info) AS movie_budget, + MIN(mi_idx.info) AS movie_votes, + MIN(n.name) AS writer, + MIN(t.title) AS complete_violent_movie +FROM complete_cast AS cc, + comp_cast_type AS cct1, + comp_cast_type AS cct2, + cast_info AS ci, + info_type AS it1, + info_type AS it2, + keyword AS k, + movie_info AS mi, + movie_info_idx AS mi_idx, + movie_keyword AS mk, + name AS n, + title AS t +WHERE cct1.kind IN ('cast', + 'crew') + AND cct2.kind ='complete+verified' + AND ci.note IN ('(writer)', + '(head writer)', + '(written by)', + '(story)', + 
'(story editor)') + AND it1.info = 'genres' + AND it2.info = 'votes' + AND k.keyword IN ('murder', + 'violence', + 'blood', + 'gore', + 'death', + 'female-nudity', + 'hospital') + AND mi.info IN ('Horror', + 'Thriller') + AND n.gender = 'm' + AND t.production_year > 2000 + AND t.id = mi.movie_id + AND t.id = mi_idx.movie_id + AND t.id = ci.movie_id + AND t.id = mk.movie_id + AND t.id = cc.movie_id + AND ci.movie_id = mi.movie_id + AND ci.movie_id = mi_idx.movie_id + AND ci.movie_id = mk.movie_id + AND ci.movie_id = cc.movie_id + AND mi.movie_id = mi_idx.movie_id + AND mi.movie_id = mk.movie_id + AND mi.movie_id = cc.movie_id + AND mi_idx.movie_id = mk.movie_id + AND mi_idx.movie_id = cc.movie_id + AND mk.movie_id = cc.movie_id + AND n.id = ci.person_id + AND it1.id = mi.info_type_id + AND it2.id = mi_idx.info_type_id + AND k.id = mk.keyword_id + AND cct1.id = cc.subject_id + AND cct2.id = cc.status_id; + diff --git a/marble/environments/db_env_docker/join-order-benchmark-master/30b.sql b/marble/environments/db_env_docker/join-order-benchmark-master/30b.sql new file mode 100644 index 00000000..91cfa290 --- /dev/null +++ b/marble/environments/db_env_docker/join-order-benchmark-master/30b.sql @@ -0,0 +1,62 @@ +SELECT MIN(mi.info) AS movie_budget, + MIN(mi_idx.info) AS movie_votes, + MIN(n.name) AS writer, + MIN(t.title) AS complete_gore_movie +FROM complete_cast AS cc, + comp_cast_type AS cct1, + comp_cast_type AS cct2, + cast_info AS ci, + info_type AS it1, + info_type AS it2, + keyword AS k, + movie_info AS mi, + movie_info_idx AS mi_idx, + movie_keyword AS mk, + name AS n, + title AS t +WHERE cct1.kind IN ('cast', + 'crew') + AND cct2.kind ='complete+verified' + AND ci.note IN ('(writer)', + '(head writer)', + '(written by)', + '(story)', + '(story editor)') + AND it1.info = 'genres' + AND it2.info = 'votes' + AND k.keyword IN ('murder', + 'violence', + 'blood', + 'gore', + 'death', + 'female-nudity', + 'hospital') + AND mi.info IN ('Horror', + 'Thriller') + AND n.gender = 'm' + AND t.production_year > 2000 + AND (t.title LIKE '%Freddy%' + OR t.title LIKE '%Jason%' + OR t.title LIKE 'Saw%') + AND t.id = mi.movie_id + AND t.id = mi_idx.movie_id + AND t.id = ci.movie_id + AND t.id = mk.movie_id + AND t.id = cc.movie_id + AND ci.movie_id = mi.movie_id + AND ci.movie_id = mi_idx.movie_id + AND ci.movie_id = mk.movie_id + AND ci.movie_id = cc.movie_id + AND mi.movie_id = mi_idx.movie_id + AND mi.movie_id = mk.movie_id + AND mi.movie_id = cc.movie_id + AND mi_idx.movie_id = mk.movie_id + AND mi_idx.movie_id = cc.movie_id + AND mk.movie_id = cc.movie_id + AND n.id = ci.person_id + AND it1.id = mi.info_type_id + AND it2.id = mi_idx.info_type_id + AND k.id = mk.keyword_id + AND cct1.id = cc.subject_id + AND cct2.id = cc.status_id; + diff --git a/marble/environments/db_env_docker/join-order-benchmark-master/30c.sql b/marble/environments/db_env_docker/join-order-benchmark-master/30c.sql new file mode 100644 index 00000000..bfa134b9 --- /dev/null +++ b/marble/environments/db_env_docker/join-order-benchmark-master/30c.sql @@ -0,0 +1,61 @@ +SELECT MIN(mi.info) AS movie_budget, + MIN(mi_idx.info) AS movie_votes, + MIN(n.name) AS writer, + MIN(t.title) AS complete_violent_movie +FROM complete_cast AS cc, + comp_cast_type AS cct1, + comp_cast_type AS cct2, + cast_info AS ci, + info_type AS it1, + info_type AS it2, + keyword AS k, + movie_info AS mi, + movie_info_idx AS mi_idx, + movie_keyword AS mk, + name AS n, + title AS t +WHERE cct1.kind = 'cast' + AND cct2.kind ='complete+verified' + AND ci.note IN 
('(writer)', + '(head writer)', + '(written by)', + '(story)', + '(story editor)') + AND it1.info = 'genres' + AND it2.info = 'votes' + AND k.keyword IN ('murder', + 'violence', + 'blood', + 'gore', + 'death', + 'female-nudity', + 'hospital') + AND mi.info IN ('Horror', + 'Action', + 'Sci-Fi', + 'Thriller', + 'Crime', + 'War') + AND n.gender = 'm' + AND t.id = mi.movie_id + AND t.id = mi_idx.movie_id + AND t.id = ci.movie_id + AND t.id = mk.movie_id + AND t.id = cc.movie_id + AND ci.movie_id = mi.movie_id + AND ci.movie_id = mi_idx.movie_id + AND ci.movie_id = mk.movie_id + AND ci.movie_id = cc.movie_id + AND mi.movie_id = mi_idx.movie_id + AND mi.movie_id = mk.movie_id + AND mi.movie_id = cc.movie_id + AND mi_idx.movie_id = mk.movie_id + AND mi_idx.movie_id = cc.movie_id + AND mk.movie_id = cc.movie_id + AND n.id = ci.person_id + AND it1.id = mi.info_type_id + AND it2.id = mi_idx.info_type_id + AND k.id = mk.keyword_id + AND cct1.id = cc.subject_id + AND cct2.id = cc.status_id; + diff --git a/marble/environments/db_env_docker/join-order-benchmark-master/31a.sql b/marble/environments/db_env_docker/join-order-benchmark-master/31a.sql new file mode 100644 index 00000000..da3f1380 --- /dev/null +++ b/marble/environments/db_env_docker/join-order-benchmark-master/31a.sql @@ -0,0 +1,54 @@ +SELECT MIN(mi.info) AS movie_budget, + MIN(mi_idx.info) AS movie_votes, + MIN(n.name) AS writer, + MIN(t.title) AS violent_liongate_movie +FROM cast_info AS ci, + company_name AS cn, + info_type AS it1, + info_type AS it2, + keyword AS k, + movie_companies AS mc, + movie_info AS mi, + movie_info_idx AS mi_idx, + movie_keyword AS mk, + name AS n, + title AS t +WHERE ci.note IN ('(writer)', + '(head writer)', + '(written by)', + '(story)', + '(story editor)') + AND cn.name LIKE 'Lionsgate%' + AND it1.info = 'genres' + AND it2.info = 'votes' + AND k.keyword IN ('murder', + 'violence', + 'blood', + 'gore', + 'death', + 'female-nudity', + 'hospital') + AND mi.info IN ('Horror', + 'Thriller') + AND n.gender = 'm' + AND t.id = mi.movie_id + AND t.id = mi_idx.movie_id + AND t.id = ci.movie_id + AND t.id = mk.movie_id + AND t.id = mc.movie_id + AND ci.movie_id = mi.movie_id + AND ci.movie_id = mi_idx.movie_id + AND ci.movie_id = mk.movie_id + AND ci.movie_id = mc.movie_id + AND mi.movie_id = mi_idx.movie_id + AND mi.movie_id = mk.movie_id + AND mi.movie_id = mc.movie_id + AND mi_idx.movie_id = mk.movie_id + AND mi_idx.movie_id = mc.movie_id + AND mk.movie_id = mc.movie_id + AND n.id = ci.person_id + AND it1.id = mi.info_type_id + AND it2.id = mi_idx.info_type_id + AND k.id = mk.keyword_id + AND cn.id = mc.company_id; + diff --git a/marble/environments/db_env_docker/join-order-benchmark-master/31b.sql b/marble/environments/db_env_docker/join-order-benchmark-master/31b.sql new file mode 100644 index 00000000..a6c506e2 --- /dev/null +++ b/marble/environments/db_env_docker/join-order-benchmark-master/31b.sql @@ -0,0 +1,59 @@ +SELECT MIN(mi.info) AS movie_budget, + MIN(mi_idx.info) AS movie_votes, + MIN(n.name) AS writer, + MIN(t.title) AS violent_liongate_movie +FROM cast_info AS ci, + company_name AS cn, + info_type AS it1, + info_type AS it2, + keyword AS k, + movie_companies AS mc, + movie_info AS mi, + movie_info_idx AS mi_idx, + movie_keyword AS mk, + name AS n, + title AS t +WHERE ci.note IN ('(writer)', + '(head writer)', + '(written by)', + '(story)', + '(story editor)') + AND cn.name LIKE 'Lionsgate%' + AND it1.info = 'genres' + AND it2.info = 'votes' + AND k.keyword IN ('murder', + 'violence', + 'blood', + 
'gore', + 'death', + 'female-nudity', + 'hospital') + AND mc.note LIKE '%(Blu-ray)%' + AND mi.info IN ('Horror', + 'Thriller') + AND n.gender = 'm' + AND t.production_year > 2000 + AND (t.title LIKE '%Freddy%' + OR t.title LIKE '%Jason%' + OR t.title LIKE 'Saw%') + AND t.id = mi.movie_id + AND t.id = mi_idx.movie_id + AND t.id = ci.movie_id + AND t.id = mk.movie_id + AND t.id = mc.movie_id + AND ci.movie_id = mi.movie_id + AND ci.movie_id = mi_idx.movie_id + AND ci.movie_id = mk.movie_id + AND ci.movie_id = mc.movie_id + AND mi.movie_id = mi_idx.movie_id + AND mi.movie_id = mk.movie_id + AND mi.movie_id = mc.movie_id + AND mi_idx.movie_id = mk.movie_id + AND mi_idx.movie_id = mc.movie_id + AND mk.movie_id = mc.movie_id + AND n.id = ci.person_id + AND it1.id = mi.info_type_id + AND it2.id = mi_idx.info_type_id + AND k.id = mk.keyword_id + AND cn.id = mc.company_id; + diff --git a/marble/environments/db_env_docker/join-order-benchmark-master/31c.sql b/marble/environments/db_env_docker/join-order-benchmark-master/31c.sql new file mode 100644 index 00000000..d96d20ca --- /dev/null +++ b/marble/environments/db_env_docker/join-order-benchmark-master/31c.sql @@ -0,0 +1,57 @@ +SELECT MIN(mi.info) AS movie_budget, + MIN(mi_idx.info) AS movie_votes, + MIN(n.name) AS writer, + MIN(t.title) AS violent_liongate_movie +FROM cast_info AS ci, + company_name AS cn, + info_type AS it1, + info_type AS it2, + keyword AS k, + movie_companies AS mc, + movie_info AS mi, + movie_info_idx AS mi_idx, + movie_keyword AS mk, + name AS n, + title AS t +WHERE ci.note IN ('(writer)', + '(head writer)', + '(written by)', + '(story)', + '(story editor)') + AND cn.name LIKE 'Lionsgate%' + AND it1.info = 'genres' + AND it2.info = 'votes' + AND k.keyword IN ('murder', + 'violence', + 'blood', + 'gore', + 'death', + 'female-nudity', + 'hospital') + AND mi.info IN ('Horror', + 'Action', + 'Sci-Fi', + 'Thriller', + 'Crime', + 'War') + AND t.id = mi.movie_id + AND t.id = mi_idx.movie_id + AND t.id = ci.movie_id + AND t.id = mk.movie_id + AND t.id = mc.movie_id + AND ci.movie_id = mi.movie_id + AND ci.movie_id = mi_idx.movie_id + AND ci.movie_id = mk.movie_id + AND ci.movie_id = mc.movie_id + AND mi.movie_id = mi_idx.movie_id + AND mi.movie_id = mk.movie_id + AND mi.movie_id = mc.movie_id + AND mi_idx.movie_id = mk.movie_id + AND mi_idx.movie_id = mc.movie_id + AND mk.movie_id = mc.movie_id + AND n.id = ci.person_id + AND it1.id = mi.info_type_id + AND it2.id = mi_idx.info_type_id + AND k.id = mk.keyword_id + AND cn.id = mc.company_id; + diff --git a/marble/environments/db_env_docker/join-order-benchmark-master/32a.sql b/marble/environments/db_env_docker/join-order-benchmark-master/32a.sql new file mode 100644 index 00000000..f099dac8 --- /dev/null +++ b/marble/environments/db_env_docker/join-order-benchmark-master/32a.sql @@ -0,0 +1,17 @@ +SELECT MIN(lt.link) AS link_type, + MIN(t1.title) AS first_movie, + MIN(t2.title) AS second_movie +FROM keyword AS k, + link_type AS lt, + movie_keyword AS mk, + movie_link AS ml, + title AS t1, + title AS t2 +WHERE k.keyword ='10,000-mile-club' + AND mk.keyword_id = k.id + AND t1.id = mk.movie_id + AND ml.movie_id = t1.id + AND ml.linked_movie_id = t2.id + AND lt.id = ml.link_type_id + AND mk.movie_id = t1.id; + diff --git a/marble/environments/db_env_docker/join-order-benchmark-master/32b.sql b/marble/environments/db_env_docker/join-order-benchmark-master/32b.sql new file mode 100644 index 00000000..e5806e74 --- /dev/null +++ 
b/marble/environments/db_env_docker/join-order-benchmark-master/32b.sql @@ -0,0 +1,17 @@ +SELECT MIN(lt.link) AS link_type, + MIN(t1.title) AS first_movie, + MIN(t2.title) AS second_movie +FROM keyword AS k, + link_type AS lt, + movie_keyword AS mk, + movie_link AS ml, + title AS t1, + title AS t2 +WHERE k.keyword ='character-name-in-title' + AND mk.keyword_id = k.id + AND t1.id = mk.movie_id + AND ml.movie_id = t1.id + AND ml.linked_movie_id = t2.id + AND lt.id = ml.link_type_id + AND mk.movie_id = t1.id; + diff --git a/marble/environments/db_env_docker/join-order-benchmark-master/33a.sql b/marble/environments/db_env_docker/join-order-benchmark-master/33a.sql new file mode 100644 index 00000000..4f63aa69 --- /dev/null +++ b/marble/environments/db_env_docker/join-order-benchmark-master/33a.sql @@ -0,0 +1,50 @@ +SELECT MIN(cn1.name) AS first_company, + MIN(cn2.name) AS second_company, + MIN(mi_idx1.info) AS first_rating, + MIN(mi_idx2.info) AS second_rating, + MIN(t1.title) AS first_movie, + MIN(t2.title) AS second_movie +FROM company_name AS cn1, + company_name AS cn2, + info_type AS it1, + info_type AS it2, + kind_type AS kt1, + kind_type AS kt2, + link_type AS lt, + movie_companies AS mc1, + movie_companies AS mc2, + movie_info_idx AS mi_idx1, + movie_info_idx AS mi_idx2, + movie_link AS ml, + title AS t1, + title AS t2 +WHERE cn1.country_code = '[us]' + AND it1.info = 'rating' + AND it2.info = 'rating' + AND kt1.kind IN ('tv series') + AND kt2.kind IN ('tv series') + AND lt.link IN ('sequel', + 'follows', + 'followed by') + AND mi_idx2.info < '3.0' + AND t2.production_year BETWEEN 2005 AND 2008 + AND lt.id = ml.link_type_id + AND t1.id = ml.movie_id + AND t2.id = ml.linked_movie_id + AND it1.id = mi_idx1.info_type_id + AND t1.id = mi_idx1.movie_id + AND kt1.id = t1.kind_id + AND cn1.id = mc1.company_id + AND t1.id = mc1.movie_id + AND ml.movie_id = mi_idx1.movie_id + AND ml.movie_id = mc1.movie_id + AND mi_idx1.movie_id = mc1.movie_id + AND it2.id = mi_idx2.info_type_id + AND t2.id = mi_idx2.movie_id + AND kt2.id = t2.kind_id + AND cn2.id = mc2.company_id + AND t2.id = mc2.movie_id + AND ml.linked_movie_id = mi_idx2.movie_id + AND ml.linked_movie_id = mc2.movie_id + AND mi_idx2.movie_id = mc2.movie_id; + diff --git a/marble/environments/db_env_docker/join-order-benchmark-master/33b.sql b/marble/environments/db_env_docker/join-order-benchmark-master/33b.sql new file mode 100644 index 00000000..ae7a3f18 --- /dev/null +++ b/marble/environments/db_env_docker/join-order-benchmark-master/33b.sql @@ -0,0 +1,48 @@ +SELECT MIN(cn1.name) AS first_company, + MIN(cn2.name) AS second_company, + MIN(mi_idx1.info) AS first_rating, + MIN(mi_idx2.info) AS second_rating, + MIN(t1.title) AS first_movie, + MIN(t2.title) AS second_movie +FROM company_name AS cn1, + company_name AS cn2, + info_type AS it1, + info_type AS it2, + kind_type AS kt1, + kind_type AS kt2, + link_type AS lt, + movie_companies AS mc1, + movie_companies AS mc2, + movie_info_idx AS mi_idx1, + movie_info_idx AS mi_idx2, + movie_link AS ml, + title AS t1, + title AS t2 +WHERE cn1.country_code = '[nl]' + AND it1.info = 'rating' + AND it2.info = 'rating' + AND kt1.kind IN ('tv series') + AND kt2.kind IN ('tv series') + AND lt.link LIKE '%follow%' + AND mi_idx2.info < '3.0' + AND t2.production_year = 2007 + AND lt.id = ml.link_type_id + AND t1.id = ml.movie_id + AND t2.id = ml.linked_movie_id + AND it1.id = mi_idx1.info_type_id + AND t1.id = mi_idx1.movie_id + AND kt1.id = t1.kind_id + AND cn1.id = mc1.company_id + AND t1.id = mc1.movie_id 
+ AND ml.movie_id = mi_idx1.movie_id + AND ml.movie_id = mc1.movie_id + AND mi_idx1.movie_id = mc1.movie_id + AND it2.id = mi_idx2.info_type_id + AND t2.id = mi_idx2.movie_id + AND kt2.id = t2.kind_id + AND cn2.id = mc2.company_id + AND t2.id = mc2.movie_id + AND ml.linked_movie_id = mi_idx2.movie_id + AND ml.linked_movie_id = mc2.movie_id + AND mi_idx2.movie_id = mc2.movie_id; + diff --git a/marble/environments/db_env_docker/join-order-benchmark-master/33c.sql b/marble/environments/db_env_docker/join-order-benchmark-master/33c.sql new file mode 100644 index 00000000..fd4d62cc --- /dev/null +++ b/marble/environments/db_env_docker/join-order-benchmark-master/33c.sql @@ -0,0 +1,52 @@ +SELECT MIN(cn1.name) AS first_company, + MIN(cn2.name) AS second_company, + MIN(mi_idx1.info) AS first_rating, + MIN(mi_idx2.info) AS second_rating, + MIN(t1.title) AS first_movie, + MIN(t2.title) AS second_movie +FROM company_name AS cn1, + company_name AS cn2, + info_type AS it1, + info_type AS it2, + kind_type AS kt1, + kind_type AS kt2, + link_type AS lt, + movie_companies AS mc1, + movie_companies AS mc2, + movie_info_idx AS mi_idx1, + movie_info_idx AS mi_idx2, + movie_link AS ml, + title AS t1, + title AS t2 +WHERE cn1.country_code != '[us]' + AND it1.info = 'rating' + AND it2.info = 'rating' + AND kt1.kind IN ('tv series', + 'episode') + AND kt2.kind IN ('tv series', + 'episode') + AND lt.link IN ('sequel', + 'follows', + 'followed by') + AND mi_idx2.info < '3.5' + AND t2.production_year BETWEEN 2000 AND 2010 + AND lt.id = ml.link_type_id + AND t1.id = ml.movie_id + AND t2.id = ml.linked_movie_id + AND it1.id = mi_idx1.info_type_id + AND t1.id = mi_idx1.movie_id + AND kt1.id = t1.kind_id + AND cn1.id = mc1.company_id + AND t1.id = mc1.movie_id + AND ml.movie_id = mi_idx1.movie_id + AND ml.movie_id = mc1.movie_id + AND mi_idx1.movie_id = mc1.movie_id + AND it2.id = mi_idx2.info_type_id + AND t2.id = mi_idx2.movie_id + AND kt2.id = t2.kind_id + AND cn2.id = mc2.company_id + AND t2.id = mc2.movie_id + AND ml.linked_movie_id = mi_idx2.movie_id + AND ml.linked_movie_id = mc2.movie_id + AND mi_idx2.movie_id = mc2.movie_id; + diff --git a/marble/environments/db_env_docker/join-order-benchmark-master/3a.sql b/marble/environments/db_env_docker/join-order-benchmark-master/3a.sql new file mode 100644 index 00000000..d10f53b4 --- /dev/null +++ b/marble/environments/db_env_docker/join-order-benchmark-master/3a.sql @@ -0,0 +1,20 @@ +SELECT MIN(t.title) AS movie_title +FROM keyword AS k, + movie_info AS mi, + movie_keyword AS mk, + title AS t +WHERE k.keyword LIKE '%sequel%' + AND mi.info IN ('Sweden', + 'Norway', + 'Germany', + 'Denmark', + 'Swedish', + 'Denish', + 'Norwegian', + 'German') + AND t.production_year > 2005 + AND t.id = mi.movie_id + AND t.id = mk.movie_id + AND mk.movie_id = mi.movie_id + AND k.id = mk.keyword_id; + diff --git a/marble/environments/db_env_docker/join-order-benchmark-master/3b.sql b/marble/environments/db_env_docker/join-order-benchmark-master/3b.sql new file mode 100644 index 00000000..d50d14a7 --- /dev/null +++ b/marble/environments/db_env_docker/join-order-benchmark-master/3b.sql @@ -0,0 +1,13 @@ +SELECT MIN(t.title) AS movie_title +FROM keyword AS k, + movie_info AS mi, + movie_keyword AS mk, + title AS t +WHERE k.keyword LIKE '%sequel%' + AND mi.info IN ('Bulgaria') + AND t.production_year > 2010 + AND t.id = mi.movie_id + AND t.id = mk.movie_id + AND mk.movie_id = mi.movie_id + AND k.id = mk.keyword_id; + diff --git 
a/marble/environments/db_env_docker/join-order-benchmark-master/3c.sql b/marble/environments/db_env_docker/join-order-benchmark-master/3c.sql new file mode 100644 index 00000000..44efbc83 --- /dev/null +++ b/marble/environments/db_env_docker/join-order-benchmark-master/3c.sql @@ -0,0 +1,22 @@ +SELECT MIN(t.title) AS movie_title +FROM keyword AS k, + movie_info AS mi, + movie_keyword AS mk, + title AS t +WHERE k.keyword LIKE '%sequel%' + AND mi.info IN ('Sweden', + 'Norway', + 'Germany', + 'Denmark', + 'Swedish', + 'Denish', + 'Norwegian', + 'German', + 'USA', + 'American') + AND t.production_year > 1990 + AND t.id = mi.movie_id + AND t.id = mk.movie_id + AND mk.movie_id = mi.movie_id + AND k.id = mk.keyword_id; + diff --git a/marble/environments/db_env_docker/join-order-benchmark-master/4a.sql b/marble/environments/db_env_docker/join-order-benchmark-master/4a.sql new file mode 100644 index 00000000..bac1b786 --- /dev/null +++ b/marble/environments/db_env_docker/join-order-benchmark-master/4a.sql @@ -0,0 +1,17 @@ +SELECT MIN(mi_idx.info) AS rating, + MIN(t.title) AS movie_title +FROM info_type AS it, + keyword AS k, + movie_info_idx AS mi_idx, + movie_keyword AS mk, + title AS t +WHERE it.info ='rating' + AND k.keyword LIKE '%sequel%' + AND mi_idx.info > '5.0' + AND t.production_year > 2005 + AND t.id = mi_idx.movie_id + AND t.id = mk.movie_id + AND mk.movie_id = mi_idx.movie_id + AND k.id = mk.keyword_id + AND it.id = mi_idx.info_type_id; + diff --git a/marble/environments/db_env_docker/join-order-benchmark-master/4b.sql b/marble/environments/db_env_docker/join-order-benchmark-master/4b.sql new file mode 100644 index 00000000..d108d8d8 --- /dev/null +++ b/marble/environments/db_env_docker/join-order-benchmark-master/4b.sql @@ -0,0 +1,17 @@ +SELECT MIN(mi_idx.info) AS rating, + MIN(t.title) AS movie_title +FROM info_type AS it, + keyword AS k, + movie_info_idx AS mi_idx, + movie_keyword AS mk, + title AS t +WHERE it.info ='rating' + AND k.keyword LIKE '%sequel%' + AND mi_idx.info > '9.0' + AND t.production_year > 2010 + AND t.id = mi_idx.movie_id + AND t.id = mk.movie_id + AND mk.movie_id = mi_idx.movie_id + AND k.id = mk.keyword_id + AND it.id = mi_idx.info_type_id; + diff --git a/marble/environments/db_env_docker/join-order-benchmark-master/4c.sql b/marble/environments/db_env_docker/join-order-benchmark-master/4c.sql new file mode 100644 index 00000000..7cf7c97e --- /dev/null +++ b/marble/environments/db_env_docker/join-order-benchmark-master/4c.sql @@ -0,0 +1,17 @@ +SELECT MIN(mi_idx.info) AS rating, + MIN(t.title) AS movie_title +FROM info_type AS it, + keyword AS k, + movie_info_idx AS mi_idx, + movie_keyword AS mk, + title AS t +WHERE it.info ='rating' + AND k.keyword LIKE '%sequel%' + AND mi_idx.info > '2.0' + AND t.production_year > 1990 + AND t.id = mi_idx.movie_id + AND t.id = mk.movie_id + AND mk.movie_id = mi_idx.movie_id + AND k.id = mk.keyword_id + AND it.id = mi_idx.info_type_id; + diff --git a/marble/environments/db_env_docker/join-order-benchmark-master/5a.sql b/marble/environments/db_env_docker/join-order-benchmark-master/5a.sql new file mode 100644 index 00000000..80ab76d0 --- /dev/null +++ b/marble/environments/db_env_docker/join-order-benchmark-master/5a.sql @@ -0,0 +1,24 @@ +SELECT MIN(t.title) AS typical_european_movie +FROM company_type AS ct, + info_type AS it, + movie_companies AS mc, + movie_info AS mi, + title AS t +WHERE ct.kind = 'production companies' + AND mc.note LIKE '%(theatrical)%' + AND mc.note LIKE '%(France)%' + AND mi.info IN ('Sweden', + 'Norway', + 
'Germany', + 'Denmark', + 'Swedish', + 'Denish', + 'Norwegian', + 'German') + AND t.production_year > 2005 + AND t.id = mi.movie_id + AND t.id = mc.movie_id + AND mc.movie_id = mi.movie_id + AND ct.id = mc.company_type_id + AND it.id = mi.info_type_id; + diff --git a/marble/environments/db_env_docker/join-order-benchmark-master/5b.sql b/marble/environments/db_env_docker/join-order-benchmark-master/5b.sql new file mode 100644 index 00000000..63e27e64 --- /dev/null +++ b/marble/environments/db_env_docker/join-order-benchmark-master/5b.sql @@ -0,0 +1,19 @@ +SELECT MIN(t.title) AS american_vhs_movie +FROM company_type AS ct, + info_type AS it, + movie_companies AS mc, + movie_info AS mi, + title AS t +WHERE ct.kind = 'production companies' + AND mc.note LIKE '%(VHS)%' + AND mc.note LIKE '%(USA)%' + AND mc.note LIKE '%(1994)%' + AND mi.info IN ('USA', + 'America') + AND t.production_year > 2010 + AND t.id = mi.movie_id + AND t.id = mc.movie_id + AND mc.movie_id = mi.movie_id + AND ct.id = mc.company_type_id + AND it.id = mi.info_type_id; + diff --git a/marble/environments/db_env_docker/join-order-benchmark-master/5c.sql b/marble/environments/db_env_docker/join-order-benchmark-master/5c.sql new file mode 100644 index 00000000..faabd4f7 --- /dev/null +++ b/marble/environments/db_env_docker/join-order-benchmark-master/5c.sql @@ -0,0 +1,26 @@ +SELECT MIN(t.title) AS american_movie +FROM company_type AS ct, + info_type AS it, + movie_companies AS mc, + movie_info AS mi, + title AS t +WHERE ct.kind = 'production companies' + AND mc.note NOT LIKE '%(TV)%' + AND mc.note LIKE '%(USA)%' + AND mi.info IN ('Sweden', + 'Norway', + 'Germany', + 'Denmark', + 'Swedish', + 'Denish', + 'Norwegian', + 'German', + 'USA', + 'American') + AND t.production_year > 1990 + AND t.id = mi.movie_id + AND t.id = mc.movie_id + AND mc.movie_id = mi.movie_id + AND ct.id = mc.company_type_id + AND it.id = mi.info_type_id; + diff --git a/marble/environments/db_env_docker/join-order-benchmark-master/6a.sql b/marble/environments/db_env_docker/join-order-benchmark-master/6a.sql new file mode 100644 index 00000000..dacef7c0 --- /dev/null +++ b/marble/environments/db_env_docker/join-order-benchmark-master/6a.sql @@ -0,0 +1,17 @@ +SELECT MIN(k.keyword) AS movie_keyword, + MIN(n.name) AS actor_name, + MIN(t.title) AS marvel_movie +FROM cast_info AS ci, + keyword AS k, + movie_keyword AS mk, + name AS n, + title AS t +WHERE k.keyword = 'marvel-cinematic-universe' + AND n.name LIKE '%Downey%Robert%' + AND t.production_year > 2010 + AND k.id = mk.keyword_id + AND t.id = mk.movie_id + AND t.id = ci.movie_id + AND ci.movie_id = mk.movie_id + AND n.id = ci.person_id; + diff --git a/marble/environments/db_env_docker/join-order-benchmark-master/6b.sql b/marble/environments/db_env_docker/join-order-benchmark-master/6b.sql new file mode 100644 index 00000000..011ab47a --- /dev/null +++ b/marble/environments/db_env_docker/join-order-benchmark-master/6b.sql @@ -0,0 +1,24 @@ +SELECT MIN(k.keyword) AS movie_keyword, + MIN(n.name) AS actor_name, + MIN(t.title) AS hero_movie +FROM cast_info AS ci, + keyword AS k, + movie_keyword AS mk, + name AS n, + title AS t +WHERE k.keyword IN ('superhero', + 'sequel', + 'second-part', + 'marvel-comics', + 'based-on-comic', + 'tv-special', + 'fight', + 'violence') + AND n.name LIKE '%Downey%Robert%' + AND t.production_year > 2014 + AND k.id = mk.keyword_id + AND t.id = mk.movie_id + AND t.id = ci.movie_id + AND ci.movie_id = mk.movie_id + AND n.id = ci.person_id; + diff --git 
a/marble/environments/db_env_docker/join-order-benchmark-master/6c.sql b/marble/environments/db_env_docker/join-order-benchmark-master/6c.sql new file mode 100644 index 00000000..cc55ef2b --- /dev/null +++ b/marble/environments/db_env_docker/join-order-benchmark-master/6c.sql @@ -0,0 +1,17 @@ +SELECT MIN(k.keyword) AS movie_keyword, + MIN(n.name) AS actor_name, + MIN(t.title) AS marvel_movie +FROM cast_info AS ci, + keyword AS k, + movie_keyword AS mk, + name AS n, + title AS t +WHERE k.keyword = 'marvel-cinematic-universe' + AND n.name LIKE '%Downey%Robert%' + AND t.production_year > 2014 + AND k.id = mk.keyword_id + AND t.id = mk.movie_id + AND t.id = ci.movie_id + AND ci.movie_id = mk.movie_id + AND n.id = ci.person_id; + diff --git a/marble/environments/db_env_docker/join-order-benchmark-master/6d.sql b/marble/environments/db_env_docker/join-order-benchmark-master/6d.sql new file mode 100644 index 00000000..9b317e15 --- /dev/null +++ b/marble/environments/db_env_docker/join-order-benchmark-master/6d.sql @@ -0,0 +1,24 @@ +SELECT MIN(k.keyword) AS movie_keyword, + MIN(n.name) AS actor_name, + MIN(t.title) AS hero_movie +FROM cast_info AS ci, + keyword AS k, + movie_keyword AS mk, + name AS n, + title AS t +WHERE k.keyword IN ('superhero', + 'sequel', + 'second-part', + 'marvel-comics', + 'based-on-comic', + 'tv-special', + 'fight', + 'violence') + AND n.name LIKE '%Downey%Robert%' + AND t.production_year > 2000 + AND k.id = mk.keyword_id + AND t.id = mk.movie_id + AND t.id = ci.movie_id + AND ci.movie_id = mk.movie_id + AND n.id = ci.person_id; + diff --git a/marble/environments/db_env_docker/join-order-benchmark-master/6e.sql b/marble/environments/db_env_docker/join-order-benchmark-master/6e.sql new file mode 100644 index 00000000..5e0bc22c --- /dev/null +++ b/marble/environments/db_env_docker/join-order-benchmark-master/6e.sql @@ -0,0 +1,17 @@ +SELECT MIN(k.keyword) AS movie_keyword, + MIN(n.name) AS actor_name, + MIN(t.title) AS marvel_movie +FROM cast_info AS ci, + keyword AS k, + movie_keyword AS mk, + name AS n, + title AS t +WHERE k.keyword = 'marvel-cinematic-universe' + AND n.name LIKE '%Downey%Robert%' + AND t.production_year > 2000 + AND k.id = mk.keyword_id + AND t.id = mk.movie_id + AND t.id = ci.movie_id + AND ci.movie_id = mk.movie_id + AND n.id = ci.person_id; + diff --git a/marble/environments/db_env_docker/join-order-benchmark-master/6f.sql b/marble/environments/db_env_docker/join-order-benchmark-master/6f.sql new file mode 100644 index 00000000..32887f9b --- /dev/null +++ b/marble/environments/db_env_docker/join-order-benchmark-master/6f.sql @@ -0,0 +1,23 @@ +SELECT MIN(k.keyword) AS movie_keyword, + MIN(n.name) AS actor_name, + MIN(t.title) AS hero_movie +FROM cast_info AS ci, + keyword AS k, + movie_keyword AS mk, + name AS n, + title AS t +WHERE k.keyword IN ('superhero', + 'sequel', + 'second-part', + 'marvel-comics', + 'based-on-comic', + 'tv-special', + 'fight', + 'violence') + AND t.production_year > 2000 + AND k.id = mk.keyword_id + AND t.id = mk.movie_id + AND t.id = ci.movie_id + AND ci.movie_id = mk.movie_id + AND n.id = ci.person_id; + diff --git a/marble/environments/db_env_docker/join-order-benchmark-master/7a.sql b/marble/environments/db_env_docker/join-order-benchmark-master/7a.sql new file mode 100644 index 00000000..27cdc801 --- /dev/null +++ b/marble/environments/db_env_docker/join-order-benchmark-master/7a.sql @@ -0,0 +1,31 @@ +SELECT MIN(n.name) AS of_person, + MIN(t.title) AS biography_movie +FROM aka_name AS an, + cast_info AS ci, + info_type AS 
it, + link_type AS lt, + movie_link AS ml, + name AS n, + person_info AS pi, + title AS t +WHERE an.name LIKE '%a%' + AND it.info ='mini biography' + AND lt.link ='features' + AND n.name_pcode_cf BETWEEN 'A' AND 'F' + AND (n.gender='m' + OR (n.gender = 'f' + AND n.name LIKE 'B%')) + AND pi.note ='Volker Boehm' + AND t.production_year BETWEEN 1980 AND 1995 + AND n.id = an.person_id + AND n.id = pi.person_id + AND ci.person_id = n.id + AND t.id = ci.movie_id + AND ml.linked_movie_id = t.id + AND lt.id = ml.link_type_id + AND it.id = pi.info_type_id + AND pi.person_id = an.person_id + AND pi.person_id = ci.person_id + AND an.person_id = ci.person_id + AND ci.movie_id = ml.linked_movie_id; + diff --git a/marble/environments/db_env_docker/join-order-benchmark-master/7b.sql b/marble/environments/db_env_docker/join-order-benchmark-master/7b.sql new file mode 100644 index 00000000..04dd3be4 --- /dev/null +++ b/marble/environments/db_env_docker/join-order-benchmark-master/7b.sql @@ -0,0 +1,29 @@ +SELECT MIN(n.name) AS of_person, + MIN(t.title) AS biography_movie +FROM aka_name AS an, + cast_info AS ci, + info_type AS it, + link_type AS lt, + movie_link AS ml, + name AS n, + person_info AS pi, + title AS t +WHERE an.name LIKE '%a%' + AND it.info ='mini biography' + AND lt.link ='features' + AND n.name_pcode_cf LIKE 'D%' + AND n.gender='m' + AND pi.note ='Volker Boehm' + AND t.production_year BETWEEN 1980 AND 1984 + AND n.id = an.person_id + AND n.id = pi.person_id + AND ci.person_id = n.id + AND t.id = ci.movie_id + AND ml.linked_movie_id = t.id + AND lt.id = ml.link_type_id + AND it.id = pi.info_type_id + AND pi.person_id = an.person_id + AND pi.person_id = ci.person_id + AND an.person_id = ci.person_id + AND ci.movie_id = ml.linked_movie_id; + diff --git a/marble/environments/db_env_docker/join-order-benchmark-master/7c.sql b/marble/environments/db_env_docker/join-order-benchmark-master/7c.sql new file mode 100644 index 00000000..c64785d3 --- /dev/null +++ b/marble/environments/db_env_docker/join-order-benchmark-master/7c.sql @@ -0,0 +1,36 @@ +SELECT MIN(n.name) AS cast_member_name, + MIN(pi.info) AS cast_member_info +FROM aka_name AS an, + cast_info AS ci, + info_type AS it, + link_type AS lt, + movie_link AS ml, + name AS n, + person_info AS pi, + title AS t +WHERE an.name IS NOT NULL + AND (an.name LIKE '%a%' + OR an.name LIKE 'A%') + AND it.info ='mini biography' + AND lt.link IN ('references', + 'referenced in', + 'features', + 'featured in') + AND n.name_pcode_cf BETWEEN 'A' AND 'F' + AND (n.gender='m' + OR (n.gender = 'f' + AND n.name LIKE 'A%')) + AND pi.note IS NOT NULL + AND t.production_year BETWEEN 1980 AND 2010 + AND n.id = an.person_id + AND n.id = pi.person_id + AND ci.person_id = n.id + AND t.id = ci.movie_id + AND ml.linked_movie_id = t.id + AND lt.id = ml.link_type_id + AND it.id = pi.info_type_id + AND pi.person_id = an.person_id + AND pi.person_id = ci.person_id + AND an.person_id = ci.person_id + AND ci.movie_id = ml.linked_movie_id; + diff --git a/marble/environments/db_env_docker/join-order-benchmark-master/8a.sql b/marble/environments/db_env_docker/join-order-benchmark-master/8a.sql new file mode 100644 index 00000000..4dd5fc0e --- /dev/null +++ b/marble/environments/db_env_docker/join-order-benchmark-master/8a.sql @@ -0,0 +1,25 @@ +SELECT MIN(an1.name) AS actress_pseudonym, + MIN(t.title) AS japanese_movie_dubbed +FROM aka_name AS an1, + cast_info AS ci, + company_name AS cn, + movie_companies AS mc, + name AS n1, + role_type AS rt, + title AS t +WHERE ci.note ='(voice: 
English version)' + AND cn.country_code ='[jp]' + AND mc.note LIKE '%(Japan)%' + AND mc.note NOT LIKE '%(USA)%' + AND n1.name LIKE '%Yo%' + AND n1.name NOT LIKE '%Yu%' + AND rt.role ='actress' + AND an1.person_id = n1.id + AND n1.id = ci.person_id + AND ci.movie_id = t.id + AND t.id = mc.movie_id + AND mc.company_id = cn.id + AND ci.role_id = rt.id + AND an1.person_id = ci.person_id + AND ci.movie_id = mc.movie_id; + diff --git a/marble/environments/db_env_docker/join-order-benchmark-master/8b.sql b/marble/environments/db_env_docker/join-order-benchmark-master/8b.sql new file mode 100644 index 00000000..7b51fd1f --- /dev/null +++ b/marble/environments/db_env_docker/join-order-benchmark-master/8b.sql @@ -0,0 +1,30 @@ +SELECT MIN(an.name) AS acress_pseudonym, + MIN(t.title) AS japanese_anime_movie +FROM aka_name AS an, + cast_info AS ci, + company_name AS cn, + movie_companies AS mc, + name AS n, + role_type AS rt, + title AS t +WHERE ci.note ='(voice: English version)' + AND cn.country_code ='[jp]' + AND mc.note LIKE '%(Japan)%' + AND mc.note NOT LIKE '%(USA)%' + AND (mc.note LIKE '%(2006)%' + OR mc.note LIKE '%(2007)%') + AND n.name LIKE '%Yo%' + AND n.name NOT LIKE '%Yu%' + AND rt.role ='actress' + AND t.production_year BETWEEN 2006 AND 2007 + AND (t.title LIKE 'One Piece%' + OR t.title LIKE 'Dragon Ball Z%') + AND an.person_id = n.id + AND n.id = ci.person_id + AND ci.movie_id = t.id + AND t.id = mc.movie_id + AND mc.company_id = cn.id + AND ci.role_id = rt.id + AND an.person_id = ci.person_id + AND ci.movie_id = mc.movie_id; + diff --git a/marble/environments/db_env_docker/join-order-benchmark-master/8c.sql b/marble/environments/db_env_docker/join-order-benchmark-master/8c.sql new file mode 100644 index 00000000..837cb788 --- /dev/null +++ b/marble/environments/db_env_docker/join-order-benchmark-master/8c.sql @@ -0,0 +1,20 @@ +SELECT MIN(a1.name) AS writer_pseudo_name, + MIN(t.title) AS movie_title +FROM aka_name AS a1, + cast_info AS ci, + company_name AS cn, + movie_companies AS mc, + name AS n1, + role_type AS rt, + title AS t +WHERE cn.country_code ='[us]' + AND rt.role ='writer' + AND a1.person_id = n1.id + AND n1.id = ci.person_id + AND ci.movie_id = t.id + AND t.id = mc.movie_id + AND mc.company_id = cn.id + AND ci.role_id = rt.id + AND a1.person_id = ci.person_id + AND ci.movie_id = mc.movie_id; + diff --git a/marble/environments/db_env_docker/join-order-benchmark-master/8d.sql b/marble/environments/db_env_docker/join-order-benchmark-master/8d.sql new file mode 100644 index 00000000..839ef186 --- /dev/null +++ b/marble/environments/db_env_docker/join-order-benchmark-master/8d.sql @@ -0,0 +1,20 @@ +SELECT MIN(an1.name) AS costume_designer_pseudo, + MIN(t.title) AS movie_with_costumes +FROM aka_name AS an1, + cast_info AS ci, + company_name AS cn, + movie_companies AS mc, + name AS n1, + role_type AS rt, + title AS t +WHERE cn.country_code ='[us]' + AND rt.role ='costume designer' + AND an1.person_id = n1.id + AND n1.id = ci.person_id + AND ci.movie_id = t.id + AND t.id = mc.movie_id + AND mc.company_id = cn.id + AND ci.role_id = rt.id + AND an1.person_id = ci.person_id + AND ci.movie_id = mc.movie_id; + diff --git a/marble/environments/db_env_docker/join-order-benchmark-master/9a.sql b/marble/environments/db_env_docker/join-order-benchmark-master/9a.sql new file mode 100644 index 00000000..6d41df82 --- /dev/null +++ b/marble/environments/db_env_docker/join-order-benchmark-master/9a.sql @@ -0,0 +1,33 @@ +SELECT MIN(an.name) AS alternative_name, + MIN(chn.name) AS character_name, + 
MIN(t.title) AS movie +FROM aka_name AS an, + char_name AS chn, + cast_info AS ci, + company_name AS cn, + movie_companies AS mc, + name AS n, + role_type AS rt, + title AS t +WHERE ci.note IN ('(voice)', + '(voice: Japanese version)', + '(voice) (uncredited)', + '(voice: English version)') + AND cn.country_code ='[us]' + AND mc.note IS NOT NULL + AND (mc.note LIKE '%(USA)%' + OR mc.note LIKE '%(worldwide)%') + AND n.gender ='f' + AND n.name LIKE '%Ang%' + AND rt.role ='actress' + AND t.production_year BETWEEN 2005 AND 2015 + AND ci.movie_id = t.id + AND t.id = mc.movie_id + AND ci.movie_id = mc.movie_id + AND mc.company_id = cn.id + AND ci.role_id = rt.id + AND n.id = ci.person_id + AND chn.id = ci.person_role_id + AND an.person_id = n.id + AND an.person_id = ci.person_id; + diff --git a/marble/environments/db_env_docker/join-order-benchmark-master/9b.sql b/marble/environments/db_env_docker/join-order-benchmark-master/9b.sql new file mode 100644 index 00000000..792ae70a --- /dev/null +++ b/marble/environments/db_env_docker/join-order-benchmark-master/9b.sql @@ -0,0 +1,31 @@ +SELECT MIN(an.name) AS alternative_name, + MIN(chn.name) AS voiced_character, + MIN(n.name) AS voicing_actress, + MIN(t.title) AS american_movie +FROM aka_name AS an, + char_name AS chn, + cast_info AS ci, + company_name AS cn, + movie_companies AS mc, + name AS n, + role_type AS rt, + title AS t +WHERE ci.note = '(voice)' + AND cn.country_code ='[us]' + AND mc.note LIKE '%(200%)%' + AND (mc.note LIKE '%(USA)%' + OR mc.note LIKE '%(worldwide)%') + AND n.gender ='f' + AND n.name LIKE '%Angel%' + AND rt.role ='actress' + AND t.production_year BETWEEN 2007 AND 2010 + AND ci.movie_id = t.id + AND t.id = mc.movie_id + AND ci.movie_id = mc.movie_id + AND mc.company_id = cn.id + AND ci.role_id = rt.id + AND n.id = ci.person_id + AND chn.id = ci.person_role_id + AND an.person_id = n.id + AND an.person_id = ci.person_id; + diff --git a/marble/environments/db_env_docker/join-order-benchmark-master/9c.sql b/marble/environments/db_env_docker/join-order-benchmark-master/9c.sql new file mode 100644 index 00000000..2c2f8cca --- /dev/null +++ b/marble/environments/db_env_docker/join-order-benchmark-master/9c.sql @@ -0,0 +1,30 @@ +SELECT MIN(an.name) AS alternative_name, + MIN(chn.name) AS voiced_character_name, + MIN(n.name) AS voicing_actress, + MIN(t.title) AS american_movie +FROM aka_name AS an, + char_name AS chn, + cast_info AS ci, + company_name AS cn, + movie_companies AS mc, + name AS n, + role_type AS rt, + title AS t +WHERE ci.note IN ('(voice)', + '(voice: Japanese version)', + '(voice) (uncredited)', + '(voice: English version)') + AND cn.country_code ='[us]' + AND n.gender ='f' + AND n.name LIKE '%An%' + AND rt.role ='actress' + AND ci.movie_id = t.id + AND t.id = mc.movie_id + AND ci.movie_id = mc.movie_id + AND mc.company_id = cn.id + AND ci.role_id = rt.id + AND n.id = ci.person_id + AND chn.id = ci.person_role_id + AND an.person_id = n.id + AND an.person_id = ci.person_id; + diff --git a/marble/environments/db_env_docker/join-order-benchmark-master/9d.sql b/marble/environments/db_env_docker/join-order-benchmark-master/9d.sql new file mode 100644 index 00000000..99bc63d7 --- /dev/null +++ b/marble/environments/db_env_docker/join-order-benchmark-master/9d.sql @@ -0,0 +1,29 @@ +SELECT MIN(an.name) AS alternative_name, + MIN(chn.name) AS voiced_char_name, + MIN(n.name) AS voicing_actress, + MIN(t.title) AS american_movie +FROM aka_name AS an, + char_name AS chn, + cast_info AS ci, + company_name AS cn, + movie_companies 
AS mc, + name AS n, + role_type AS rt, + title AS t +WHERE ci.note IN ('(voice)', + '(voice: Japanese version)', + '(voice) (uncredited)', + '(voice: English version)') + AND cn.country_code ='[us]' + AND n.gender ='f' + AND rt.role ='actress' + AND ci.movie_id = t.id + AND t.id = mc.movie_id + AND ci.movie_id = mc.movie_id + AND mc.company_id = cn.id + AND ci.role_id = rt.id + AND n.id = ci.person_id + AND chn.id = ci.person_role_id + AND an.person_id = n.id + AND an.person_id = ci.person_id; + diff --git a/marble/environments/db_env_docker/join-order-benchmark-master/README.md b/marble/environments/db_env_docker/join-order-benchmark-master/README.md new file mode 100644 index 00000000..2cf8f104 --- /dev/null +++ b/marble/environments/db_env_docker/join-order-benchmark-master/README.md @@ -0,0 +1,74 @@ +# join-order-benchmark + +This package contains the Join Order Benchmark (JOB) queries from: +"How Good Are Query Optimizers, Really?" +by Viktor Leis, Andrey Gubichev, Atanas Mirchev, Peter Boncz, Alfons Kemper, Thomas Neumann +PVLDB Volume 9, No. 3, 2015 +[http://www.vldb.org/pvldb/vol9/p204-leis.pdf](http://www.vldb.org/pvldb/vol9/p204-leis.pdf) + +### IMDB Data Set +The CSV files used in the paper, which are from May 2013, can be found +at [http://homepages.cwi.nl/~boncz/job/imdb.tgz](http://homepages.cwi.nl/~boncz/job/imdb.tgz) + +The license and links to the current version of the IMDB data set can be +found at [http://www.imdb.com/interfaces](http://www.imdb.com/interfaces) + +### Step-by-step instructions +1. download `*gz` files (unpacking not necessary) + + ```sh + wget ftp://ftp.fu-berlin.de/misc/movies/database/frozendata/*gz + ``` + +2. download and unpack `imdbpy` and the `imdbpy2sql.py` script + + ```sh + wget https://bitbucket.org/alberanid/imdbpy/get/5.0.zip + ``` + +3. create an openGauss database (e.g., named imdbload): + + ```sh + createdb imdbload + ``` + +4. transform `*gz` files to relational schema (takes a while) + + ```sh + imdbpy2sql.py -d PATH_TO_GZ_FILES -u postgres://username:password@hostname/imdbload + ``` + +Now you should have an openGauss database named `imdbload` with the +IMDB data. Note that this database has some secondary indexes (but not +on all foreign key attributes). You can export all tables to CSV: + +```sql +\copy aka_name to 'PATH/aka_name.csv' csv +\copy aka_title to 'PATH/aka_title.csv' csv +\copy cast_info to 'PATH/cast_info.csv' csv +\copy char_name to 'PATH/char_name.csv' csv +\copy comp_cast_type to 'PATH/comp_cast_type.csv' csv +\copy company_name to 'PATH/company_name.csv' csv +\copy company_type to 'PATH/company_type.csv' csv +\copy complete_cast to 'PATH/complete_cast.csv' csv +\copy info_type to 'PATH/info_type.csv' csv +\copy keyword to 'PATH/keyword.csv' csv +\copy kind_type to 'PATH/kind_type.csv' csv +\copy link_type to 'PATH/link_type.csv' csv +\copy movie_companies to 'PATH/movie_companies.csv' csv +\copy movie_info to 'PATH/movie_info.csv' csv +\copy movie_info_idx to 'PATH/movie_info_idx.csv' csv +\copy movie_keyword to 'PATH/movie_keyword.csv' csv +\copy movie_link to 'PATH/movie_link.csv' csv +\copy name to 'PATH/name.csv' csv +\copy person_info to 'PATH/person_info.csv' csv +\copy role_type to 'PATH/role_type.csv' csv +\copy title to 'PATH/title.csv' csv +``` + +To import the CSV files to another database, create all tables (see +`schema.sql` and optionally `fkindexes.sql`) and run the same copy +statements as above, but replace the keyword "to" with "from".
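+ +For example, importing the first two tables back would look like the following; this is only a sketch, assuming the CSV files were exported to 'PATH' as shown above and that all tables from `schema.sql` have already been created: + +```sql +\copy aka_name from 'PATH/aka_name.csv' csv +\copy aka_title from 'PATH/aka_title.csv' csv +``` + +The remaining tables follow the same pattern.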
+ +### Questions +Contact Viktor Leis (leis@in.tum.de) if you have any questions. diff --git a/marble/environments/db_env_docker/join-order-benchmark-master/fkindexes.sql b/marble/environments/db_env_docker/join-order-benchmark-master/fkindexes.sql new file mode 100644 index 00000000..23c6ff93 --- /dev/null +++ b/marble/environments/db_env_docker/join-order-benchmark-master/fkindexes.sql @@ -0,0 +1,23 @@ +create index company_id_movie_companies on movie_companies(company_id); +create index company_type_id_movie_companies on movie_companies(company_type_id); +create index info_type_id_movie_info_idx on movie_info_idx(info_type_id); +create index info_type_id_movie_info on movie_info(info_type_id); +create index info_type_id_person_info on person_info(info_type_id); +create index keyword_id_movie_keyword on movie_keyword(keyword_id); +create index kind_id_aka_title on aka_title(kind_id); +create index kind_id_title on title(kind_id); +create index linked_movie_id_movie_link on movie_link(linked_movie_id); +create index link_type_id_movie_link on movie_link(link_type_id); +create index movie_id_aka_title on aka_title(movie_id); +create index movie_id_cast_info on cast_info(movie_id); +create index movie_id_complete_cast on complete_cast(movie_id); +create index movie_id_movie_companies on movie_companies(movie_id); +create index movie_id_movie_info_idx on movie_info_idx(movie_id); +create index movie_id_movie_keyword on movie_keyword(movie_id); +create index movie_id_movie_link on movie_link(movie_id); +create index movie_id_movie_info on movie_info(movie_id); +create index person_id_aka_name on aka_name(person_id); +create index person_id_cast_info on cast_info(person_id); +create index person_id_person_info on person_info(person_id); +create index person_role_id_cast_info on cast_info(person_role_id); +create index role_id_cast_info on cast_info(role_id); diff --git a/marble/environments/db_env_docker/join-order-benchmark-master/schema.sql b/marble/environments/db_env_docker/join-order-benchmark-master/schema.sql new file mode 100644 index 00000000..7f691e10 --- /dev/null +++ b/marble/environments/db_env_docker/join-order-benchmark-master/schema.sql @@ -0,0 +1,170 @@ +CREATE TABLE aka_name ( + id integer NOT NULL PRIMARY KEY, + person_id integer NOT NULL, + name text NOT NULL, + imdb_index character varying(12), + name_pcode_cf character varying(5), + name_pcode_nf character varying(5), + surname_pcode character varying(5), + md5sum character varying(32) +); + +CREATE TABLE aka_title ( + id integer NOT NULL PRIMARY KEY, + movie_id integer NOT NULL, + title text NOT NULL, + imdb_index character varying(12), + kind_id integer NOT NULL, + production_year integer, + phonetic_code character varying(5), + episode_of_id integer, + season_nr integer, + episode_nr integer, + note text, + md5sum character varying(32) +); + +CREATE TABLE cast_info ( + id integer NOT NULL PRIMARY KEY, + person_id integer NOT NULL, + movie_id integer NOT NULL, + person_role_id integer, + note text, + nr_order integer, + role_id integer NOT NULL +); + +CREATE TABLE char_name ( + id integer NOT NULL PRIMARY KEY, + name text NOT NULL, + imdb_index character varying(12), + imdb_id integer, + name_pcode_nf character varying(5), + surname_pcode character varying(5), + md5sum character varying(32) +); + +CREATE TABLE comp_cast_type ( + id integer NOT NULL PRIMARY KEY, + kind character varying(32) NOT NULL +); + +CREATE TABLE company_name ( + id integer NOT NULL PRIMARY KEY, + name text NOT NULL, + country_code character 
varying(255), + imdb_id integer, + name_pcode_nf character varying(5), + name_pcode_sf character varying(5), + md5sum character varying(32) +); + +CREATE TABLE company_type ( + id integer NOT NULL PRIMARY KEY, + kind character varying(32) NOT NULL +); + +CREATE TABLE complete_cast ( + id integer NOT NULL PRIMARY KEY, + movie_id integer, + subject_id integer NOT NULL, + status_id integer NOT NULL +); + +CREATE TABLE info_type ( + id integer NOT NULL PRIMARY KEY, + info character varying(32) NOT NULL +); + +CREATE TABLE keyword ( + id integer NOT NULL PRIMARY KEY, + keyword text NOT NULL, + phonetic_code character varying(5) +); + +CREATE TABLE kind_type ( + id integer NOT NULL PRIMARY KEY, + kind character varying(15) NOT NULL +); + +CREATE TABLE link_type ( + id integer NOT NULL PRIMARY KEY, + link character varying(32) NOT NULL +); + +CREATE TABLE movie_companies ( + id integer NOT NULL PRIMARY KEY, + movie_id integer NOT NULL, + company_id integer NOT NULL, + company_type_id integer NOT NULL, + note text +); + +CREATE TABLE movie_info ( + id integer NOT NULL PRIMARY KEY, + movie_id integer NOT NULL, + info_type_id integer NOT NULL, + info text NOT NULL, + note text +); + +CREATE TABLE movie_info_idx ( + id integer NOT NULL PRIMARY KEY, + movie_id integer NOT NULL, + info_type_id integer NOT NULL, + info text NOT NULL, + note text +); + +CREATE TABLE movie_keyword ( + id integer NOT NULL PRIMARY KEY, + movie_id integer NOT NULL, + keyword_id integer NOT NULL +); + +CREATE TABLE movie_link ( + id integer NOT NULL PRIMARY KEY, + movie_id integer NOT NULL, + linked_movie_id integer NOT NULL, + link_type_id integer NOT NULL +); + +CREATE TABLE name ( + id integer NOT NULL PRIMARY KEY, + name text NOT NULL, + imdb_index character varying(12), + imdb_id integer, + gender character varying(1), + name_pcode_cf character varying(5), + name_pcode_nf character varying(5), + surname_pcode character varying(5), + md5sum character varying(32) +); + +CREATE TABLE person_info ( + id integer NOT NULL PRIMARY KEY, + person_id integer NOT NULL, + info_type_id integer NOT NULL, + info text NOT NULL, + note text +); + +CREATE TABLE role_type ( + id integer NOT NULL PRIMARY KEY, + role character varying(32) NOT NULL +); + +CREATE TABLE title ( + id integer NOT NULL PRIMARY KEY, + title text NOT NULL, + imdb_index character varying(12), + kind_id integer NOT NULL, + production_year integer, + imdb_id integer, + phonetic_code character varying(5), + episode_of_id integer, + season_nr integer, + episode_nr integer, + series_years character varying(49), + md5sum character varying(32) +); diff --git a/marble/environments/db_env_docker/test_set.json b/marble/environments/db_env_docker/test_set.json new file mode 100644 index 00000000..7b310861 --- /dev/null +++ b/marble/environments/db_env_docker/test_set.json @@ -0,0 +1,7719 @@ +{ + "0": { + "start_time": "1697296768", + "end_time": "1697296839", + "start_timestamp": "2023-10-14 23:19:28", + "end_timestamp": "2023-10-14 23:20:39", + "alerts": [], + "labels": [ + "highly concurrent commits or highly concurrent inserts" + ], + "command": "python anomaly_trigger/main.py --anomaly INSERT_LARGE_DATA", + "script": "import psycopg2\nimport sys\nsys.path.append('/root/DB-GPT/')\nimport time\nimport datetime\nimport random\nimport yaml\nfrom multiprocessing.pool import *\n\nclass DBArgs(object):\n\n def __init__(self, dbtype, config, dbname=None):\n self.dbtype = dbtype\n if self.dbtype == 'mysql':\n self.host = config['host']\n self.port = config['port']\n self.user = 
config['user']\n self.password = config['password']\n self.dbname = dbname if dbname else config['dbname']\n self.driver = 'com.mysql.jdbc.Driver'\n self.jdbc = 'jdbc:mysql://'\n else:\n self.host = config['host']\n self.port = config['port']\n self.user = config['user']\n self.password = config['password']\n self.dbname = dbname if dbname else config['dbname']\n self.driver = 'org.postgresql.Driver'\n self.jdbc = 'jdbc:postgresql://'\n\nclass Database():\n def __init__(self, args, timeout=-1):\n self.args = args\n self.conn = self.resetConn(timeout)\n\n\n # self.schema = self.compute_table_schema()\n\n def resetConn(self, timeout=-1):\n conn = psycopg2.connect(database=self.args.dbname,\n user=self.args.user,\n password=self.args.password,\n host=self.args.host,\n port=self.args.port)\n return conn\n\n def execute_sqls(self,sql):\n self.conn =self.resetConn(timeout=-1)\n cur = self.conn.cursor()\n cur.execute(sql)\n self.conn.commit()\n cur.close()\n self.conn.close()\n\n def execute_sql_duration(self, duration, sql, max_id=0, commit_interval=500):\n self.conn = self.resetConn(timeout=-1)\n cursor = self.conn.cursor()\n start = time.time()\n cnt = 0\n if duration > 0:\n while (time.time() - start) < duration:\n if max_id > 0:\n id = random.randint(1, max_id - 1)\n cursor.execute(sql + str(id) + ';')\n else:\n cursor.execute(sql)\n cnt += 1\n if cnt % commit_interval == 0:\n self.conn.commit()\n else:\n print(\"error, the duration should be larger than 0\")\n self.conn.commit()\n cursor.close()\n self.conn.close()\n return cnt\n\n def concurrent_execute_sql(self, threads, duration, sql, max_id=0, commit_interval=500):\n pool = ThreadPool(threads)\n results = [pool.apply_async(self.execute_sql_duration, (duration, sql, max_id, commit_interval)) for _ in range(threads)]\n pool.close()\n pool.join()\n return results\n \ndef init():\n #add the config\n config_path = \"/root/DB-GPT/config/tool_config.yaml\"\n with open(config_path, 'r') as config_file:\n config = yaml.safe_load(config_file) \n db_args =DBArgs('pgsql', config)\n return db_args\n\n#create a table\ndef create_table(table_name,colsize, ncolumns):\n db=Database(init())\n column_definitions = ', '.join(f'name{i} varchar({colsize})' for i in range(ncolumns))\n creat_sql = f'CREATE TABLE {table_name} (id int, {column_definitions}, time timestamp);'\n db.execute_sqls(creat_sql)\n\n#delete the table\ndef delete_table(table_name):\n db=Database(init())\n delete_sql=f'DROP TABLE if exists {table_name}'\n db.execute_sqls(delete_sql)\n\n#print the current time\ndef print_time():\n current_time = datetime.datetime.now()\n formatted_time = current_time.strftime(\"%Y-%m-%d %H:%M:%S\")\n print(formatted_time)\n\n\ndef insert_large_data(threads,duration,ncolumns,nrows,colsize,table_name='table1'):\n\n print_time()\n #Delete undeleted tables\n delete_table(table_name)\n #create a new table\n create_table(table_name,colsize, ncolumns)\n db=Database(init())\n #insert the data\n #insert_definitions = ', '.join(f'repeat(round(random()*999)::text,{(colsize//3)})' for i in range(ncolumns))\n insert_definitions = ', '.join(f'(SELECT substr(md5(random()::text), 1, {colsize}))' for i in range(ncolumns))\n insert_data=f'insert into {table_name} select generate_series(1,{nrows}),{insert_definitions}, now();'\n db.concurrent_execute_sql(threads,duration,insert_data,commit_interval=1)\n\n #delete the table\n delete_table(table_name)\n \n #print the end time\n print_time()\nif __name__ == \"__main__\":\n # Number of threads to use for concurrent inserts\n 
num_threads = 53\n \n # Duration for which to run the inserts (in seconds)\n insert_duration = None\n \n # Number of columns in the table\n num_columns = 18\n \n # Number of rows to insert\n num_rows = 75\n \n # Size of each column (in characters)\n column_size = 46\n \n # Table name\n table_name = 'table1'\n \n # Call the insert_large_data function\n insert_large_data(num_threads, insert_duration, num_columns, num_rows, column_size, table_name)\n", + "description": "In a data-intensive application, where 53 users simultaneously insert a large amount of data into a database table containing 18 columns and 75 rows, each column being 46 characters long, a database exception may occur due to the high workload.\n", + "desc": "In the Internet of Things (IoT) scenario, imagine a database called 'IoTDataDB' that is specifically designed to store and process data collected from various sensors. This database is used in a smart home environment to record sensor readings such as temperature, humidity, air quality, and motion. The primary table in this database is called 'SensorReadings', which contains detailed information about the sensor readings. Each row in this table represents a single sensor reading, and there are 75 rows of data in total. The table has 18 columns, including sensor ID, reading type, reading value, timestamp, location, and status. These columns can each store up to 46 characters of data. When there are 53 sensors transmitting data simultaneously at a high frequency, the database might face challenges in efficiently processing and storing this large volume of data. The lack of appropriate data partitioning, buffering mechanisms, or indexing in the 'SensorReadings' table could lead to performance issues and anomalies. For example, the database might experience increased write latency, delayed processing of new sensor readings, or even database locking. 
This could affect the real-time monitoring and automation capabilities of the smart home system.\n" + }, + "1": { + "start_time": "1697296899", + "end_time": "1697296970", + "start_timestamp": "2023-10-14 23:21:39", + "end_timestamp": "2023-10-14 23:22:50", + "alerts": [], + "labels": [ + "highly concurrent commits or highly concurrent inserts" + ], + "command": "python anomaly_trigger/main.py --anomaly INSERT_LARGE_DATA", + "script": "import psycopg2\nimport sys\nsys.path.append('/root/DB-GPT/')\nimport time\nimport datetime\nimport random\nimport yaml\nfrom multiprocessing.pool import *\n\nclass DBArgs(object):\n\n def __init__(self, dbtype, config, dbname=None):\n self.dbtype = dbtype\n if self.dbtype == 'mysql':\n self.host = config['host']\n self.port = config['port']\n self.user = config['user']\n self.password = config['password']\n self.dbname = dbname if dbname else config['dbname']\n self.driver = 'com.mysql.jdbc.Driver'\n self.jdbc = 'jdbc:mysql://'\n else:\n self.host = config['host']\n self.port = config['port']\n self.user = config['user']\n self.password = config['password']\n self.dbname = dbname if dbname else config['dbname']\n self.driver = 'org.postgresql.Driver'\n self.jdbc = 'jdbc:postgresql://'\n\nclass Database():\n def __init__(self, args, timeout=-1):\n self.args = args\n self.conn = self.resetConn(timeout)\n\n\n # self.schema = self.compute_table_schema()\n\n def resetConn(self, timeout=-1):\n conn = psycopg2.connect(database=self.args.dbname,\n user=self.args.user,\n password=self.args.password,\n host=self.args.host,\n port=self.args.port)\n return conn\n\n def execute_sqls(self,sql):\n self.conn =self.resetConn(timeout=-1)\n cur = self.conn.cursor()\n cur.execute(sql)\n self.conn.commit()\n cur.close()\n self.conn.close()\n\n def execute_sql_duration(self, duration, sql, max_id=0, commit_interval=500):\n self.conn = self.resetConn(timeout=-1)\n cursor = self.conn.cursor()\n start = time.time()\n cnt = 0\n if duration > 0:\n while (time.time() - start) < duration:\n if max_id > 0:\n id = random.randint(1, max_id - 1)\n cursor.execute(sql + str(id) + ';')\n else:\n cursor.execute(sql)\n cnt += 1\n if cnt % commit_interval == 0:\n self.conn.commit()\n else:\n print(\"error, the duration should be larger than 0\")\n self.conn.commit()\n cursor.close()\n self.conn.close()\n return cnt\n\n def concurrent_execute_sql(self, threads, duration, sql, max_id=0, commit_interval=500):\n pool = ThreadPool(threads)\n results = [pool.apply_async(self.execute_sql_duration, (duration, sql, max_id, commit_interval)) for _ in range(threads)]\n pool.close()\n pool.join()\n return results\n \ndef init():\n #add the config\n config_path = \"/root/DB-GPT/config/tool_config.yaml\"\n with open(config_path, 'r') as config_file:\n config = yaml.safe_load(config_file) \n db_args =DBArgs('pgsql', config)\n return db_args\n\n#create a table\ndef create_table(table_name,colsize, ncolumns):\n db=Database(init())\n column_definitions = ', '.join(f'name{i} varchar({colsize})' for i in range(ncolumns))\n creat_sql = f'CREATE TABLE {table_name} (id int, {column_definitions}, time timestamp);'\n db.execute_sqls(creat_sql)\n\n#delete the table\ndef delete_table(table_name):\n db=Database(init())\n delete_sql=f'DROP TABLE if exists {table_name}'\n db.execute_sqls(delete_sql)\n\n#print the current time\ndef print_time():\n current_time = datetime.datetime.now()\n formatted_time = current_time.strftime(\"%Y-%m-%d %H:%M:%S\")\n print(formatted_time)\n\n\ndef 
insert_large_data(threads,duration,ncolumns,nrows,colsize,table_name='table1'):\n\n print_time()\n #Delete undeleted tables\n delete_table(table_name)\n #create a new table\n create_table(table_name,colsize, ncolumns)\n db=Database(init())\n #insert the data\n #insert_definitions = ', '.join(f'repeat(round(random()*999)::text,{(colsize//3)})' for i in range(ncolumns))\n insert_definitions = ', '.join(f'(SELECT substr(md5(random()::text), 1, {colsize}))' for i in range(ncolumns))\n insert_data=f'insert into {table_name} select generate_series(1,{nrows}),{insert_definitions}, now();'\n db.concurrent_execute_sql(threads,duration,insert_data,commit_interval=1)\n\n #delete the table\n delete_table(table_name)\n \n #print the end time\n print_time()\nif __name__ == \"__main__\":\n # Number of threads to use for concurrent inserts\n num_threads = 53\n \n # Duration for which to run the inserts (in seconds)\n insert_duration = None\n \n # Number of columns in the table\n num_columns = 25\n \n # Number of rows to insert\n num_rows = 69\n \n # Size of each column (in characters)\n column_size = 70\n \n # Table name\n table_name = 'table1'\n \n # Call the insert_large_data function\n insert_large_data(num_threads, insert_duration, num_columns, num_rows, column_size, table_name)\n", + "description": "In a scientific research project, data generated by 53 sensors need to be inserted into a database table simultaneously. The database table contains 25 columns and 69 rows, with each column having a size of 70 characters. This action simulates the database exception caused by the simultaneous insertion of large amounts of data.\n", + "desc": "In the Internet of Things (IoT) scenario, let's consider a database used for collecting and analyzing sensor data called 'SensorDataDB'. This database is designed to handle a large volume of data from various types of sensors. There is a main table called 'SensorReadings' that stores information about the readings from these sensors. This table contains 69 rows of data, with each row representing a reading from a sensor. There are 25 columns in this table, including sensor ID, reading type, reading value, timestamp, sensor location, and status information, each column having a size of 70 characters. Now, suppose 53 sensors start transmitting data simultaneously. Due to the lack of proper buffering mechanisms, inadequate data partitioning, or improper indexing in the 'SensorReadings' table, the database might struggle to handle these numerous concurrent write requests efficiently. 
This can lead to increased write latency, performance issues, and even anomalies in the database.\n" + }, + "2": { + "start_time": "1697297030", + "end_time": "1697297090", + "start_timestamp": "2023-10-14 23:23:50", + "end_timestamp": "2023-10-14 23:24:50", + "alerts": [], + "labels": [ + "highly concurrent updates" + ], + "command": "python anomaly_trigger/main.py --anomaly LOCK_CONTENTION", + "script": "import psycopg2\nimport sys\nsys.path.append('/root/DB-GPT/')\nimport time\nimport datetime\nimport random\nimport yaml\nfrom multiprocessing.pool import *\n\n\nclass DBArgs(object):\n\n def __init__(self, dbtype, config, dbname=None):\n self.dbtype = dbtype\n if self.dbtype == 'mysql':\n self.host = config['host']\n self.port = config['port']\n self.user = config['user']\n self.password = config['password']\n self.dbname = dbname if dbname else config['dbname']\n self.driver = 'com.mysql.jdbc.Driver'\n self.jdbc = 'jdbc:mysql://'\n else:\n self.host = config['host']\n self.port = config['port']\n self.user = config['user']\n self.password = config['password']\n self.dbname = dbname if dbname else config['dbname']\n self.driver = 'org.postgresql.Driver'\n self.jdbc = 'jdbc:postgresql://'\n\nclass Database():\n def __init__(self, args, timeout=-1):\n self.args = args\n self.conn = self.resetConn(timeout)\n\n\n # self.schema = self.compute_table_schema()\n\n def resetConn(self, timeout=-1):\n conn = psycopg2.connect(database=self.args.dbname,\n user=self.args.user,\n password=self.args.password,\n host=self.args.host,\n port=self.args.port)\n return conn\n \n def execute_sqls(self,sql):\n self.conn =self.resetConn(timeout=-1)\n cur = self.conn.cursor()\n cur.execute(sql)\n self.conn.commit()\n cur.close()\n self.conn.close()\n\n def execute_sql_duration(self, duration, sql, max_id=0, commit_interval=500):\n self.conn = self.resetConn(timeout=-1)\n cursor = self.conn.cursor()\n start = time.time()\n cnt = 0\n if duration > 0:\n while (time.time() - start) < duration:\n if max_id > 0:\n id = random.randint(1, max_id - 1)\n cursor.execute(sql + str(id) + ';')\n else:\n cursor.execute(sql)\n cnt += 1\n if cnt % commit_interval == 0:\n self.conn.commit()\n else:\n print(\"error, the duration should be larger than 0\")\n self.conn.commit()\n cursor.close()\n self.conn.close()\n return cnt\n\n def concurrent_execute_sql(self, threads, duration, sql, max_id=0, commit_interval=500):\n pool = ThreadPool(threads)\n results = [pool.apply_async(self.execute_sql_duration, (duration, sql, max_id, commit_interval)) for _ in range(threads)]\n pool.close()\n pool.join()\n return results\n\ndef init():\n #add the config\n config_path = \"/root/DB-GPT/config/tool_config.yaml\"\n with open(config_path, 'r') as config_file:\n config = yaml.safe_load(config_file) \n db_args =DBArgs('pgsql', config)\n return db_args\n\n\n#create a table\ndef create_table(table_name,colsize, ncolumns):\n db=Database(init())\n column_definitions = ', '.join(f'name{i} varchar({colsize})' for i in range(ncolumns))\n creat_sql = f'CREATE TABLE {table_name} (id int, {column_definitions}, time timestamp);'\n db.execute_sqls(creat_sql)\n\n#delete the table\ndef delete_table(table_name):\n db=Database(init())\n delete_sql=f'DROP TABLE if exists {table_name}'\n db.execute_sqls(delete_sql)\n\n#print the current time\ndef print_time():\n current_time = datetime.datetime.now()\n formatted_time = current_time.strftime(\"%Y-%m-%d %H:%M:%S\")\n print(formatted_time)\n\n\ndef 
lock_contention(threads,duration,ncolumns,nrows,colsize,table_name='table1'):\n #create a new table\n print_time()\n delete_table(table_name)\n create_table(table_name,colsize, ncolumns)\n db=Database(init())\n # insert some data to be updated \n insert_definitions = ', '.join(f'(SELECT substr(md5(random()::text), 1, {colsize}))' for i in range(ncolumns))\n insert_data=f'insert into {table_name} select generate_series(1,{nrows}),{insert_definitions}, now();' \n db.execute_sqls(insert_data) \n pool = Pool(threads)\n for _ in range(threads):\n pool.apply_async(\n lock, (table_name, ncolumns, colsize, duration, nrows))\n pool.close()\n pool.join()\n #delete the table\n delete_table(table_name)\n print_time()\n\ndef lock(table_name, ncolumns, colsize, duration, nrows):\n args=init()\n start = time.time()\n #lock_contention\n while time.time()-start < duration:\n conn = psycopg2.connect(database=args.dbname, user=args.user, password=args.password,\n host=args.host, port=args.port)\n cur = conn.cursor()\n while time.time()-start < duration:\n col_name = random.randint(0, ncolumns-1)\n row_name = random.randint(1, nrows-1)\n lock_contention = f'update {table_name} set name{col_name}=(SELECT substr(md5(random()::text), 1, {colsize})) where id ={row_name}'\n #db.concurrent_execute_sql(threads,duration,lock_contention,nrows)\n cur.execute(lock_contention)\n conn.commit()\n conn.commit()\n conn.close()\n\nif __name__ == \"__main__\":\n # Number of threads to use for concurrent inserts\n num_threads = 191\n \n # Duration for which to run the inserts (in seconds)\n insert_duration = None\n \n # Number of columns in the table\n num_columns = 78\n \n # Number of rows to insert\n num_rows = 305\n \n # Size of each column (in characters)\n column_size = 90\n \n # Table name\n table_name = 'table1'\n \n # Call the insert_large_data function\n lock_contention(num_threads, insert_duration, num_columns, num_rows, column_size, table_name)\n", + "description": "In a database of a large online store, 191 users simultaneously compete to perform frequent update operations on a database table containing 78 columns and 305 rows of product records, where each column has a size of 90 characters. This process causes contention for locking the database table and can result in a database exception.\n", + "desc": "In an IoT (Internet of Things) scenario, there is a database called 'IoTDataDB' which is used for storing and analyzing sensor data. This database contains a main table called 'SensorReadings' where data from various sensors is stored. Each row in the table represents a reading from a sensor and contains information such as sensor ID, reading type, reading value, timestamp, and sensor location. In this specific case, there are 305 rows of data in the 'SensorReadings' table, with 78 columns in each row. The columns contain information of up to 90 characters, including details about the sensor, the reading, and additional metadata. During a high usage period, there are 191 tasks running concurrently that try to access and update the data in the 'SensorReadings' table. This high level of concurrency could lead to contention while accessing the database and may result in locking issues, where multiple tasks compete for access to the same data. 
These locking issues can affect the performance and efficiency of the database, potentially causing delays or failures in processing sensor data and affecting the overall functioning of the IoT system.\n" + }, + "3": { + "start_time": "1697297150", + "end_time": "1697297328", + "start_timestamp": "2023-10-14 23:25:50", + "end_timestamp": "2023-10-14 23:28:48", + "alerts": [], + "labels": [ + "highly deletes" + ], + "command": "python anomaly_trigger/main.py --anomaly VACUUM", + "script": "import psycopg2\nimport sys\nsys.path.append('/root/DB-GPT/')\nimport time\nimport datetime\nimport random\nimport yaml\nfrom multiprocessing.pool import *\n\n\nclass DBArgs(object):\n\n def __init__(self, dbtype, config, dbname=None):\n self.dbtype = dbtype\n if self.dbtype == 'mysql':\n self.host = config['host']\n self.port = config['port']\n self.user = config['user']\n self.password = config['password']\n self.dbname = dbname if dbname else config['dbname']\n self.driver = 'com.mysql.jdbc.Driver'\n self.jdbc = 'jdbc:mysql://'\n else:\n self.host = config['host']\n self.port = config['port']\n self.user = config['user']\n self.password = config['password']\n self.dbname = dbname if dbname else config['dbname']\n self.driver = 'org.postgresql.Driver'\n self.jdbc = 'jdbc:postgresql://'\n\nclass Database():\n def __init__(self, args, timeout=-1):\n self.args = args\n self.conn = self.resetConn(timeout)\n\n\n # self.schema = self.compute_table_schema()\n\n def resetConn(self, timeout=-1):\n conn = psycopg2.connect(database=self.args.dbname,\n user=self.args.user,\n password=self.args.password,\n host=self.args.host,\n port=self.args.port)\n return conn\n \n def execute_sqls(self,sql):\n self.conn =self.resetConn(timeout=-1)\n cur = self.conn.cursor()\n cur.execute(sql)\n self.conn.commit()\n cur.close()\n self.conn.close()\n\n def execute_sql_duration(self, duration, sql, max_id=0, commit_interval=500):\n self.conn = self.resetConn(timeout=-1)\n cursor = self.conn.cursor()\n start = time.time()\n cnt = 0\n if duration > 0:\n while (time.time() - start) < duration:\n if max_id > 0:\n id = random.randint(1, max_id - 1)\n cursor.execute(sql + str(id) + ';')\n else:\n cursor.execute(sql)\n cnt += 1\n if cnt % commit_interval == 0:\n self.conn.commit()\n else:\n print(\"error, the duration should be larger than 0\")\n self.conn.commit()\n cursor.close()\n self.conn.close()\n return cnt\n\n def concurrent_execute_sql(self, threads, duration, sql, max_id=0, commit_interval=500):\n pool = ThreadPool(threads)\n results = [pool.apply_async(self.execute_sql_duration, (duration, sql, max_id, commit_interval)) for _ in range(threads)]\n pool.close()\n pool.join()\n return results\n \ndef init():\n #add the config\n config_path = \"/root/DB-GPT/config/tool_config.yaml\"\n with open(config_path, 'r') as config_file:\n config = yaml.safe_load(config_file) \n db_args =DBArgs('pgsql', config)\n return db_args\n\n\n#create a table\ndef create_table(table_name,colsize, ncolumns):\n db=Database(init())\n column_definitions = ', '.join(f'name{i} varchar({colsize})' for i in range(ncolumns))\n creat_sql = f'CREATE TABLE {table_name} (id int, {column_definitions}, time timestamp);'\n db.execute_sqls(creat_sql)\n\n#delete the table\ndef delete_table(table_name):\n db=Database(init())\n delete_sql=f'DROP TABLE if exists {table_name}'\n db.execute_sqls(delete_sql)\n\n#print the current time\ndef print_time():\n current_time = datetime.datetime.now()\n formatted_time = current_time.strftime(\"%Y-%m-%d %H:%M:%S\")\n 
print(formatted_time)\n\n\ndef vacuum(threads,duration,ncolumns,nrows,colsize,table_name='table1'):\n db=Database(init())\n #create a new table\n print_time()\n delete_table(table_name)\n create_table(table_name,colsize, ncolumns)\n\n # insert some data to be deleted\n insert_definitions = ', '.join(f'(SELECT substr(md5(random()::text), 1, {colsize}))' for i in range(ncolumns))\n insert_data=f'insert into {table_name} select generate_series(1,{nrows}),{insert_definitions}, now();' \n db.execute_sqls(insert_data) \n\n # delete 80% of the rows\n delete_nrows=int(nrows*0.8)\n vacuum=f'delete from {table_name} where id < {delete_nrows};'\n db.execute_sqls(vacuum)\n\n # do the select , then the vacuum occurs\n select='select * from '+table_name+' where id='\n db.concurrent_execute_sql(threads,duration,select,nrows)\n\n #delete the table\n delete_table(table_name)\n print_time()\n\n\nif __name__ == \"__main__\":\n # Number of threads to use for concurrent inserts\n num_threads = 176\n \n # Duration for which to run the inserts (in seconds)\n insert_duration = None\n \n # Number of columns in the table\n num_columns = 18\n \n # Number of rows to insert\n num_rows = 3167807\n \n # Size of each column (in characters)\n column_size = 70\n \n # Table name\n table_name = 'table1'\n \n # Call the insert_large_data function\n vacuum(num_threads, insert_duration, num_columns, num_rows, column_size, table_name)\n", + "description": "In an online marketplace, 176 users perform simultaneous searches on a database table containing 18 columns and 3,167,807 rows of product records. Each column has a size of 70 characters. The searches are conducted after a large-scale data cleaning operation, which causes a database exception due to increased workload and decreased performance.\n", + "desc": "In an Internet of Things (IoT) scenario, imagine a database used for collecting and analyzing sensor data in a smart city infrastructure. This database is named 'SmartCityDB' and contains a table called 'SensorReadings' which stores information from various types of sensors across the city. Each row in this table represents a specific sensor reading and includes fields such as sensor ID, reading type (such as temperature, humidity, air quality), reading value, timestamp, location, and sensor status. Suppose that the database administrator needs to perform a vacuum operation on the 'SensorReadings' table. This operation is needed to reclaim unused space and optimize the performance of the database. In this specific case, the 'SensorReadings' table has a total of 3,167,807 rows, with each row containing 18 columns, each of which can store information up to 70 characters in length. Furthermore, during the vacuum operation, 176 threads are utilized to perform the process concurrently, enhancing the efficiency of the vacuuming process. The vacuum operation helps to remove dead tuples, update statistics, and organize data pages in a more efficient way, resulting in improved database performance and reduced storage usage. 
By performing vacuum, the database can effectively manage the growing sensor data and ensure smooth data retrieval and analysis processes in the smart city infrastructure.\n" + }, + "4": { + "start_time": "1697297388", + "end_time": "1697297503", + "start_timestamp": "2023-10-14 23:29:48", + "end_timestamp": "2023-10-14 23:31:43", + "alerts": [], + "labels": [ + "too many indexes" + ], + "command": "python anomaly_trigger/main.py --anomaly REDUNDANT_INDEX", + "script": "import psycopg2\nimport sys\nsys.path.append('/root/DB-GPT/')\nimport time\nimport datetime\nimport random\nimport yaml\nfrom multiprocessing.pool import *\n\n\nclass DBArgs(object):\n\n def __init__(self, dbtype, config, dbname=None):\n self.dbtype = dbtype\n if self.dbtype == 'mysql':\n self.host = config['host']\n self.port = config['port']\n self.user = config['user']\n self.password = config['password']\n self.dbname = dbname if dbname else config['dbname']\n self.driver = 'com.mysql.jdbc.Driver'\n self.jdbc = 'jdbc:mysql://'\n else:\n self.host = config['host']\n self.port = config['port']\n self.user = config['user']\n self.password = config['password']\n self.dbname = dbname if dbname else config['dbname']\n self.driver = 'org.postgresql.Driver'\n self.jdbc = 'jdbc:postgresql://'\n\nclass Database():\n def __init__(self, args, timeout=-1):\n self.args = args\n self.conn = self.resetConn(timeout)\n\n\n # self.schema = self.compute_table_schema()\n\n def resetConn(self, timeout=-1):\n conn = psycopg2.connect(database=self.args.dbname,\n user=self.args.user,\n password=self.args.password,\n host=self.args.host,\n port=self.args.port)\n return conn\n \n def execute_sqls(self,sql):\n self.conn =self.resetConn(timeout=-1)\n cur = self.conn.cursor()\n cur.execute(sql)\n self.conn.commit()\n cur.close()\n self.conn.close()\n\n def execute_sql_duration(self, duration, sql, max_id=0, commit_interval=500):\n self.conn = self.resetConn(timeout=-1)\n cursor = self.conn.cursor()\n start = time.time()\n cnt = 0\n if duration > 0:\n while (time.time() - start) < duration:\n if max_id > 0:\n id = random.randint(1, max_id - 1)\n cursor.execute(sql + str(id) + ';')\n else:\n cursor.execute(sql)\n cnt += 1\n if cnt % commit_interval == 0:\n self.conn.commit()\n else:\n print(\"error, the duration should be larger than 0\")\n self.conn.commit()\n cursor.close()\n self.conn.close()\n return cnt\n\n def concurrent_execute_sql(self, threads, duration, sql, max_id=0, commit_interval=500):\n pool = ThreadPool(threads)\n results = [pool.apply_async(self.execute_sql_duration, (duration, sql, max_id, commit_interval)) for _ in range(threads)]\n pool.close()\n pool.join()\n return results\n \n def build_index(self, table_name, idx_num):\n self.conn = self.resetConn(timeout=-1)\n cursor = self.conn.cursor()\n \n for i in range(0, idx_num):\n the_sql = 'CREATE INDEX index_' + table_name + '_' + str(i) + ' ON ' + table_name + '(name' + str(i) + ');'\n print(the_sql)\n cursor.execute(the_sql)\n\n \n self.conn.commit()\n self.conn.close()\n return\n\n\n \n def drop_index(self,table_name):\n self.conn = self.resetConn(timeout=-1)\n cursor = self.conn.cursor()\n cursor.execute(\"select indexname from pg_indexes where tablename='\"+table_name+\"';\")\n idxs = cursor.fetchall()\n for idx in idxs:\n the_sql = 'DROP INDEX ' + idx[0] + ';'\n cursor.execute(the_sql)\n print(the_sql)\n self.conn.commit()\n self.conn.close()\n return\n\n\ndef init():\n #add the config\n config_path = \"/root/DB-GPT/config/tool_config.yaml\"\n with open(config_path, 'r') as 
config_file:\n config = yaml.safe_load(config_file) \n db_args =DBArgs('pgsql', config)\n return db_args\n\n\n#create a table\ndef create_table(table_name,colsize, ncolumns):\n db=Database(init())\n column_definitions = ', '.join(f'name{i} varchar({colsize})' for i in range(ncolumns))\n creat_sql = f'CREATE TABLE {table_name} (id int, {column_definitions}, time timestamp);'\n db.execute_sqls(creat_sql)\n\n#delete the table\ndef delete_table(table_name):\n db=Database(init())\n delete_sql=f'DROP TABLE if exists {table_name}'\n db.execute_sqls(delete_sql)\n\n#print the current time\ndef print_time():\n current_time = datetime.datetime.now()\n formatted_time = current_time.strftime(\"%Y-%m-%d %H:%M:%S\")\n print(formatted_time)\n\ndef redundent_index(threads,duration,ncolumns,nrows,colsize,nindex,table_name='table1'):\n #create a new table\n print_time()\n delete_table(table_name)\n create_table(table_name,colsize, ncolumns)\n db=Database(init())\n # insert some data to be updated \n insert_definitions = ', '.join(f'(SELECT substr(md5(random()::text), 1, {colsize}))' for i in range(ncolumns))\n insert_data=f'insert into {table_name} select generate_series(1,{nrows}),{insert_definitions}, now();' \n db.execute_sqls(insert_data) \n\n #initialization of the indexes\n nindex=int((nindex*ncolumns)/10)\n db.build_index(table_name,nindex)\n id_index='CREATE INDEX index_'+table_name+'_id ON '+table_name+'(id);'\n db.execute_sqls(id_index)\n\n #lock_contention\n pool = Pool(threads)\n for _ in range(threads):\n pool.apply_async(\n lock, (table_name, ncolumns, colsize, duration, nrows))\n pool.close()\n pool.join()\n\n #drop the index\n db.drop_index(table_name)\n\n #delete the table\n delete_table(table_name)\n print_time()\n\n\ndef lock(table_name, ncolumns, colsize, duration, nrows):\n args=init()\n start = time.time()\n #lock_contention\n while time.time()-start < duration:\n conn = psycopg2.connect(database=args.dbname, user=args.user, password=args.password,\n host=args.host, port=args.port)\n cur = conn.cursor()\n while time.time()-start < duration:\n col_name = random.randint(0, ncolumns-1)\n row_name = random.randint(1, nrows-1)\n lock_contention = f'update {table_name} set name{col_name}=(SELECT substr(md5(random()::text), 1, {colsize})) where id ={row_name}'\n #db.concurrent_execute_sql(threads,duration,lock_contention,nrows)\n cur.execute(lock_contention)\n conn.commit()\n conn.commit()\n conn.close()\n\nif __name__ == \"__main__\":\n # Number of threads to use for concurrent inserts\n num_threads = 9\n \n # Duration for which to run the inserts (in seconds)\n insert_duration = None\n \n # Number of columns in the table\n num_columns = 57\n \n # Number of rows to insert\n num_rows = 799006\n \n # Size of each column (in characters)\n column_size = 56\n \n # Table name\n table_name = 'table1'\n \n nindex=6\n \n # Call the insert_large_data function\n redundent_index(num_threads, insert_duration, num_columns, num_rows, column_size, nindex,table_name)\n", + "description": "In a digital marketing company, the database contains 57 columns and 799,006 rows of customer information. Each column has a size of 56 characters. Initially, a large number of indexes are created for customer attributes such as name, age, and gender. Then, nine users simultaneously perform queries on the customer data, and the indexes are deleted after the queries are completed. 
This simulates the additional storage and performance impact caused by redundant indexes.\n", + "desc": "In a business intelligence scenario, an anomaly is triggered by the redundant creation of indexes in a database used for storing and processing financial data of large corporations. This database, named 'CorporateFinanceDB', contains multiple tables, one of which is the 'FinancialRecords' table. It holds 799,006 rows of financial records, with each row representing a unique entry and containing 57 columns, each having a size of 56 characters. These columns include information such as transaction ID, transaction type, amount, date, department, project code, budget code, financial year, and audit status. In this scenario, there are 9 users simultaneously performing complex financial queries on the 'FinancialRecords' table. To optimize the performance of these queries, the database administrator creates multiple indexes before executing the queries and deletes them afterward. However, this frequent creation and deletion of indexes can lead to excessive storage usage and performance overhead. It may also cause database fragmentation, which in turn impacts the efficiency of the decision-making process in the business intelligence environment.\n" + }, + "5": { + "start_time": "1697297563", + "end_time": "1697297653", + "start_timestamp": "2023-10-14 23:32:43", + "end_timestamp": "2023-10-14 23:34:13", + "alerts": [], + "labels": [ + "INSERT_LARGE_DATA", + "IO_CONTENTION" + ], + "command": "python anomaly_trigger/main.py --anomaly INSERT_LARGE_DATA,IO_CONTENTION", + "script": "import os\nimport datetime\n\n#print the current time\ndef print_time():\n current_time = datetime.datetime.now()\n formatted_time = current_time.strftime(\"%Y-%m-%d %H:%M:%S\")\n print(formatted_time)\n\nif __name__ == \"__main__\":\n print_time()\n command = (\n \"su - root -c 'cd /sysbench-tpcc-master; \"\n \"./tpcc.lua --db-driver=pgsql --tables=2 --scale=3 --threads=50 --events=0 \"\n \"--pgsql-host=xxxx --pgsql-user=xxxx --pgsql-password=xxxx \"\n \"--pgsql-port=5432 --pgsql-db=tpcc --time=90 --rand-type=uniform --report-interval=10 run'\"\n )\n\n os.system(command)\n print_time()\n", + "description": "In a file sharing system where multiple users are uploading, downloading, or editing files at the same time, the system is experiencing contention for input/output (I/O) resources. This results in a slowdown in file transfers.\n", + "desc": "In an IoT (Internet of Things) scenario, let's consider a database called 'SensorDataDB', which is used to collect and analyze data from various sensors. This database is designed to handle a large volume of sensor data. There is a key table named 'SensorReadings' within this database, which stores information about readings from multiple sensors. Each row in this table represents a reading from a sensor, and the table includes fields such as sensor ID, reading type, reading value, timestamp, sensor location, and status information. Now, suppose that multiple sensors are transmitting data simultaneously at a high frequency. This can create performance issues in the database, particularly if it doesn't have effective data partitioning, buffering mechanisms, or proper indexing. As a result, the database might struggle to process these numerous concurrent write requests efficiently. This can lead to increased write latency and even database locking, causing anomalies in the database. 
In this specific scenario, the combination of a large amount of data being inserted and I/O contention due to the high frequency of data transmission from sensors can trigger performance issues and anomalies in the database.\n" + }, + "6": { + "start_time": "1697297714", + "end_time": "1697297774", + "start_timestamp": "2023-10-14 23:35:14", + "end_timestamp": "2023-10-14 23:36:14", + "alerts": [], + "labels": [ + "POOR JOIN PERFORMANCE", + "CPU CONTENTION" + ], + "command": "python anomaly_trigger/main.py --anomaly POOR_JOIN_PERFORMANCE,CPU_CONTENTION", + "script": "import os\nimport re\nimport time\n\nimport psycopg2\n\n\nREPEATCOUNT = 1\nTIMELOGPATH = str(int(time.time())) + \"_job_trigger_time_log.txt\"\nTIMELOG = open(TIMELOGPATH, 'w+')\n\n\nclass Database():\n\n def __init__(self):\n self.conn = None\n self.conn = psycopg2.connect(database='imdbload',\n user='xxxx',\n password='xxxx',\n host='xxxx',\n port=5432)\n\n def execute_sql(self, sql):\n fail = 1\n cur = self.conn.cursor()\n i = 0\n cnt = 3\n while fail == 1 and i < cnt:\n try:\n fail = 0\n cur.execute(sql)\n except BaseException as error:\n fail = 1\n print(error)\n res = []\n if fail == 0:\n res = cur.fetchall()\n i = i + 1\n if fail == 1:\n # print(\"SQL Execution Fatal!!\", sql)\n return 0, ''\n elif fail == 0:\n return 1, res\n\n\ndef all_sql_files():\n res_path = \"{}/join-order-benchmark-master/\".format(\n os.path.dirname(os.path.abspath(__file__)))\n # all_file_list = list(filter(file_filter, os.listdir(res_path)))\n # all_file_list = sorted(all_file_list, key=custom_sort)\n all_file_list = [\n '1a.sql', '1b.sql', '1c.sql', '1d.sql',\n '2a.sql', '2b.sql', '2c.sql', '2d.sql',\n '3a.sql', '3b.sql', '3c.sql',\n '4a.sql', '4b.sql', '4c.sql',\n '5a.sql', '5b.sql', '5c.sql',\n '6a.sql', '6b.sql', '6c.sql', '6d.sql', '6e.sql', '6f.sql',\n '7a.sql', '7b.sql', '7c.sql',\n '8a.sql', '8b.sql', '8c.sql', '8d.sql',\n '9a.sql', '9b.sql', '9c.sql', '9d.sql',\n '10a.sql', '10b.sql', '10c.sql',\n '11a.sql', '11b.sql', '11c.sql', '11d.sql',\n '12a.sql', '12b.sql', '12c.sql',\n '13a.sql', '13b.sql', '13c.sql', '13d.sql',\n '14a.sql', '14b.sql', '14c.sql',\n '15a.sql', '15b.sql', '15c.sql', '15d.sql',\n '16a.sql', '16b.sql', '16c.sql', '16d.sql',\n '17a.sql', '17b.sql', '17c.sql', '17d.sql', '17e.sql', '17f.sql',\n '18a.sql', '18b.sql', '18c.sql',\n '19a.sql', '19b.sql', '19c.sql', '19d.sql',\n '20a.sql', '20b.sql', '20c.sql',\n '21a.sql', '21b.sql', '21c.sql',\n '22a.sql', '22b.sql', '22c.sql', '22d.sql',\n '23a.sql', '23b.sql', '23c.sql',\n '24a.sql', '24b.sql',\n '25a.sql', '25b.sql', '25c.sql',\n '26a.sql', '26b.sql', '26c.sql',\n '27a.sql', '27b.sql', '27c.sql',\n '28a.sql', '28b.sql', '28c.sql',\n '29a.sql', '29b.sql', '29c.sql',\n '30a.sql', '30b.sql', '30c.sql',\n '31a.sql', '31b.sql', '31c.sql',\n '32a.sql', '32b.sql',\n '33a.sql', '33b.sql', '33c.sql']\n\n print(all_file_list)\n files_list = []\n for file in all_file_list:\n files_list.append(res_path + file)\n return files_list\n\n\ndef custom_sort(item):\n # \u63d0\u53d6\u6570\u5b57\u548c\u5b57\u6bcd\u90e8\u5206\n match = re.match(r'(\\d+)(\\D+)', item)\n # \u5c06\u6570\u5b57\u90e8\u5206\u8f6c\u6362\u4e3a\u6574\u6570\u4ee5\u8fdb\u884c\u6bd4\u8f83\n num_part = int(match.group(1))\n # \u8fd4\u56de\u5143\u7ec4\u4ee5\u6309\u6570\u5b57\u548c\u5b57\u6bcd\u6392\u5e8f\n return (num_part, match.group(2))\n\n\ndef file_filter(f):\n if f[-4:] == '.sql' and 'schema' not in f and 'fkindexes' not in f:\n return True\n else:\n return False\n\n\ndef get_sql_from_file(file_name):\n file = 
open(file_name)\n lines = file.readlines().copy()\n sql = ''\n for line in lines:\n sql += line\n sql = sql.replace('\n', ' ').replace(' ', ' ').replace(' ', ' ')\n file.close()\n return sql\n\n\ndef test_hint_from_file(sql_file):\n db = Database()\n sql = get_sql_from_file(sql_file)\n success, result_cont = db.execute_sql(sql)\n print(success, result_cont)\n\n\ndef test_all():\n sql_files = all_sql_files()\n\n for sql_file in list(sql_files)[:-10]:\n if sql_file:\n test_hint_from_file(sql_file)\n\n\ndef test_one():\n res_path = \"{}/join-order-benchmark-master/\".format(\n os.path.dirname(os.path.abspath(__file__)))\n test_hint_from_file(res_path + '1a.sql')\n\n\nif __name__ == '__main__':\n for i in range(0, REPEATCOUNT):\n TIMELOG.write(str(int(time.time()))+\";\")\n test_all()\n TIMELOG.write(str(int(time.time()))+\"\n\")\n TIMELOG.flush()\n\n TIMELOG.close()\n\n)\n", + "description": "In a large-scale data analysis scenario, multiple users simultaneously perform a join operation on a database table containing a large number of columns and rows. This join operation puts a high load on the CPU, causing contention and resulting in poor performance.\n", + "desc": "In an Internet of Things (IoT) scenario, there is a database called 'SensorDataDB' that collects and analyzes data from various sensors. This database is designed to handle a large volume of sensor data. In one of the tables named 'SensorReadings', there are records of sensor readings from different types of sensors. These records include information such as sensor ID, reading type, reading value, timestamp, sensor location, and status. Suppose there is a performance issue with joining tables in this database. It means that when performing queries that involve joining the 'SensorReadings' table with other tables, such as 'SensorInformation' or 'LocationData', the database experiences poor performance. The poor join performance could be due to various factors such as lack of proper indexing on join columns, inefficient query execution plans, or large data volumes in the joined tables. 
This can result in slow query execution, high CPU consumption, and overall decreased database performance.\n" + }, + "7": { + "start_time": "1697297834", + "end_time": "1697297983", + "start_timestamp": "2023-10-14 23:37:14", + "end_timestamp": "2023-10-14 23:39:43", + "alerts": [], + "labels": [ + "FETCH_LARGE_DATA", + "CORRELATED SUBQUERY" + ], + "command": "python anomaly_trigger/main.py --anomaly FETCH_LARGE_DATA,CORRELATED_SUBQUERY", + "script": "import os\nimport re\nimport time\n\nimport psycopg2\n\n\nREPEATCOUNT = 1\nTIMELOGPATH = str(int(time.time())) + \"_tpch_trigger_time_log.txt\"\nTIMELOG = open(TIMELOGPATH, 'w+')\n\n\nclass Database():\n\n def __init__(self):\n self.conn = None\n self.conn = psycopg2.connect(database='tpch',\n user='xxxx',\n password='xxxx',\n host='xxxx',\n port=5432)\n\n def execute_sql(self, sql):\n fail = 1\n cur = self.conn.cursor()\n i = 0\n cnt = 3\n while fail == 1 and i < cnt:\n try:\n fail = 0\n cur.execute(sql)\n except BaseException as error:\n fail = 1\n print(error)\n res = []\n if fail == 0:\n res = cur.fetchall()\n i = i + 1\n if fail == 1:\n # print(\"SQL Execution Fatal!!\", sql)\n return 0, ''\n elif fail == 0:\n return 1, res\n\n\ndef all_sql_files():\n res_path = \"{}/tpch-queries/\".format(\n os.path.dirname(os.path.abspath(__file__)))\n # all_file_list = list(filter(file_filter, os.listdir(res_path)))\n # all_file_list = sorted(all_file_list, key=custom_sort)\n all_file_list = [\n '4.explain.sql']\n\n print(all_file_list)\n files_list = []\n for file in all_file_list:\n files_list.append(res_path + file)\n return files_list\n\n\ndef custom_sort(item):\n # \u63d0\u53d6\u6570\u5b57\u548c\u5b57\u6bcd\u90e8\u5206\n match = re.match(r'(\\d+)(\\D+)', item)\n # \u5c06\u6570\u5b57\u90e8\u5206\u8f6c\u6362\u4e3a\u6574\u6570\u4ee5\u8fdb\u884c\u6bd4\u8f83\n num_part = int(match.group(1))\n # \u8fd4\u56de\u5143\u7ec4\u4ee5\u6309\u6570\u5b57\u548c\u5b57\u6bcd\u6392\u5e8f\n return (num_part, match.group(2))\n\n\ndef file_filter(f):\n if f[-4:] == '.sql' and 'schema' not in f and 'fkindexes' not in f:\n return True\n else:\n return False\n\n\ndef get_sql_from_file(file_name):\n file = open(file_name)\n lines = file.readlines().copy()\n sql = ''\n for line in lines:\n sql += line\n sql = sql.replace('\n', ' ').replace(' ', ' ').replace(' ', ' ')\n file.close()\n return sql\n\n\ndef test_hint_from_file(sql_file):\n db = Database()\n sql = get_sql_from_file(sql_file)\n success, result_cont = db.execute_sql(sql)\n print(success, result_cont)\n\n\ndef test_all():\n sql_files = all_sql_files()\n\n for sql_file in list(sql_files):\n if sql_file:\n test_hint_from_file(sql_file)\n\n\ndef test_one():\n res_path = \"{}/tpch-queries/\".format(\n os.path.dirname(os.path.abspath(__file__)))\n test_hint_from_file(res_path + '1.explain.sql')\n\n\nif __name__ == '__main__':\n for i in range(0, REPEATCOUNT):\n TIMELOG.write(str(int(time.time()))+\";\")\n test_all()\n TIMELOG.write(str(int(time.time()))+\"\n\")\n TIMELOG.flush()\n\n TIMELOG.close()\n", + "description": "In an online store's database, when fetching a large amount of data from the inventory, the script executes subqueries that have a correlation with the main query. If these correlated subqueries are not optimized, the performance of the query may degrade.\n", + "desc": "In an IoT scenario, suppose there is a database named 'IoTDataDB', which is responsible for storing and processing data collected from various sensors. 
This database contains a table called 'SensorReadings', which records information about sensor readings. For each reading, the table contains the sensor ID, reading type, reading value, timestamp, sensor location, and other relevant details. In this particular scenario, the database encounters issues when fetching large amounts of data that involve correlated subqueries. For example, there might be a requirement to retrieve all sensor readings for a specific location and calculate the average reading value for each sensor. To achieve this, the database needs to perform subqueries to filter the relevant sensor readings and then calculate the average value for each sensor. However, when dealing with a large number of readings and sensors, executing multiple subqueries can become inefficient and impact the overall performance of the database. The database might need to read a significant amount of data from disk, leading to I/O bottlenecks and slower query execution times.\n" + }, + "8": { + "start_time": "1697298043", + "end_time": "1697298115", + "start_timestamp": "2023-10-14 23:40:43", + "end_timestamp": "2023-10-14 23:41:55", + "alerts": [], + "labels": [ + "highly concurrent commits or highly concurrent inserts" + ], + "command": "python anomaly_trigger/main.py --anomaly INSERT_LARGE_DATA", + "script": "import psycopg2\nimport sys\nsys.path.append('/root/DB-GPT/')\nimport time\nimport datetime\nimport random\nimport yaml\nfrom multiprocessing.pool import *\n\nclass DBArgs(object):\n\n def __init__(self, dbtype, config, dbname=None):\n self.dbtype = dbtype\n if self.dbtype == 'mysql':\n self.host = config['host']\n self.port = config['port']\n self.user = config['user']\n self.password = config['password']\n self.dbname = dbname if dbname else config['dbname']\n self.driver = 'com.mysql.jdbc.Driver'\n self.jdbc = 'jdbc:mysql://'\n else:\n self.host = config['host']\n self.port = config['port']\n self.user = config['user']\n self.password = config['password']\n self.dbname = dbname if dbname else config['dbname']\n self.driver = 'org.postgresql.Driver'\n self.jdbc = 'jdbc:postgresql://'\n\nclass Database():\n def __init__(self, args, timeout=-1):\n self.args = args\n self.conn = self.resetConn(timeout)\n\n\n # self.schema = self.compute_table_schema()\n\n def resetConn(self, timeout=-1):\n conn = psycopg2.connect(database=self.args.dbname,\n user=self.args.user,\n password=self.args.password,\n host=self.args.host,\n port=self.args.port)\n return conn\n\n def execute_sqls(self,sql):\n self.conn =self.resetConn(timeout=-1)\n cur = self.conn.cursor()\n cur.execute(sql)\n self.conn.commit()\n cur.close()\n self.conn.close()\n\n def execute_sql_duration(self, duration, sql, max_id=0, commit_interval=500):\n self.conn = self.resetConn(timeout=-1)\n cursor = self.conn.cursor()\n start = time.time()\n cnt = 0\n if duration > 0:\n while (time.time() - start) < duration:\n if max_id > 0:\n id = random.randint(1, max_id - 1)\n cursor.execute(sql + str(id) + ';')\n else:\n cursor.execute(sql)\n cnt += 1\n if cnt % commit_interval == 0:\n self.conn.commit()\n else:\n print(\"error, the duration should be larger than 0\")\n self.conn.commit()\n cursor.close()\n self.conn.close()\n return cnt\n\n def concurrent_execute_sql(self, threads, duration, sql, max_id=0, commit_interval=500):\n pool = ThreadPool(threads)\n results = [pool.apply_async(self.execute_sql_duration, (duration, sql, max_id, commit_interval)) for _ in range(threads)]\n pool.close()\n pool.join()\n return results\n \ndef init():\n #add 
the config\n config_path = \"/root/DB-GPT/config/tool_config.yaml\"\n with open(config_path, 'r') as config_file:\n config = yaml.safe_load(config_file) \n db_args =DBArgs('pgsql', config)\n return db_args\n\n#create a table\ndef create_table(table_name,colsize, ncolumns):\n db=Database(init())\n column_definitions = ', '.join(f'name{i} varchar({colsize})' for i in range(ncolumns))\n creat_sql = f'CREATE TABLE {table_name} (id int, {column_definitions}, time timestamp);'\n db.execute_sqls(creat_sql)\n\n#delete the table\ndef delete_table(table_name):\n db=Database(init())\n delete_sql=f'DROP TABLE if exists {table_name}'\n db.execute_sqls(delete_sql)\n\n#print the current time\ndef print_time():\n current_time = datetime.datetime.now()\n formatted_time = current_time.strftime(\"%Y-%m-%d %H:%M:%S\")\n print(formatted_time)\n\n\ndef insert_large_data(threads,duration,ncolumns,nrows,colsize,table_name='table1'):\n\n print_time()\n #Delete undeleted tables\n delete_table(table_name)\n #create a new table\n create_table(table_name,colsize, ncolumns)\n db=Database(init())\n #insert the data\n #insert_definitions = ', '.join(f'repeat(round(random()*999)::text,{(colsize//3)})' for i in range(ncolumns))\n insert_definitions = ', '.join(f'(SELECT substr(md5(random()::text), 1, {colsize}))' for i in range(ncolumns))\n insert_data=f'insert into {table_name} select generate_series(1,{nrows}),{insert_definitions}, now();'\n db.concurrent_execute_sql(threads,duration,insert_data,commit_interval=1)\n\n #delete the table\n delete_table(table_name)\n \n #print the end time\n print_time()\nif __name__ == \"__main__\":\n # Number of threads to use for concurrent inserts\n num_threads = 117\n \n # Duration for which to run the inserts (in seconds)\n insert_duration = None\n \n # Number of columns in the table\n num_columns = 11\n \n # Number of rows to insert\n num_rows = 55\n \n # Size of each column (in characters)\n column_size = 33\n \n # Table name\n table_name = 'table1'\n \n # Call the insert_large_data function\n insert_large_data(num_threads, insert_duration, num_columns, num_rows, column_size, table_name)\n", + "description": "In a scientific research facility, 117 experiments are being conducted simultaneously. Each experiment generates a large amount of data with 11 parameters, each consisting of 33 characters. The data from these experiments needs to be inserted into the database, which may result in a database exception due to the high volume of data being inserted concurrently.\n", + "desc": "In an Internet of Things (IoT) scenario, suppose there is a database named 'IoTDeviceDB', which is used to store sensor data collected from various devices. This database includes a primary table named 'SensorData' that records detailed sensor readings. Each row in this table represents a reading from a specific sensor, with a total of 11 columns storing information such as sensor ID, reading type, reading value, timestamp, location, and status. Now, imagine a situation where 117 sensors are simultaneously sending data to the database at a high frequency. Due to the sheer volume of data being inserted and the lack of proper optimization techniques, the database might experience performance issues. Without efficient data buffering mechanisms or proper indexing, the database's ability to handle these numerous concurrent insert operations might be strained. This can result in increased latency in the database and potentially lead to anomalies in the system. 
These anomalies could affect the overall performance of the IoT platform and hinder the accurate and timely processing of incoming sensor data.\n" + }, + "9": { + "start_time": "1697298175", + "end_time": "1697298246", + "start_timestamp": "2023-10-14 23:42:55", + "end_timestamp": "2023-10-14 23:44:06", + "alerts": [], + "labels": [ + "highly concurrent commits or highly concurrent inserts" + ], + "command": "python anomaly_trigger/main.py --anomaly INSERT_LARGE_DATA", + "script": "import psycopg2\nimport sys\nsys.path.append('/root/DB-GPT/')\nimport time\nimport datetime\nimport random\nimport yaml\nfrom multiprocessing.pool import *\n\nclass DBArgs(object):\n\n def __init__(self, dbtype, config, dbname=None):\n self.dbtype = dbtype\n if self.dbtype == 'mysql':\n self.host = config['host']\n self.port = config['port']\n self.user = config['user']\n self.password = config['password']\n self.dbname = dbname if dbname else config['dbname']\n self.driver = 'com.mysql.jdbc.Driver'\n self.jdbc = 'jdbc:mysql://'\n else:\n self.host = config['host']\n self.port = config['port']\n self.user = config['user']\n self.password = config['password']\n self.dbname = dbname if dbname else config['dbname']\n self.driver = 'org.postgresql.Driver'\n self.jdbc = 'jdbc:postgresql://'\n\nclass Database():\n def __init__(self, args, timeout=-1):\n self.args = args\n self.conn = self.resetConn(timeout)\n\n\n # self.schema = self.compute_table_schema()\n\n def resetConn(self, timeout=-1):\n conn = psycopg2.connect(database=self.args.dbname,\n user=self.args.user,\n password=self.args.password,\n host=self.args.host,\n port=self.args.port)\n return conn\n\n def execute_sqls(self,sql):\n self.conn =self.resetConn(timeout=-1)\n cur = self.conn.cursor()\n cur.execute(sql)\n self.conn.commit()\n cur.close()\n self.conn.close()\n\n def execute_sql_duration(self, duration, sql, max_id=0, commit_interval=500):\n self.conn = self.resetConn(timeout=-1)\n cursor = self.conn.cursor()\n start = time.time()\n cnt = 0\n if duration > 0:\n while (time.time() - start) < duration:\n if max_id > 0:\n id = random.randint(1, max_id - 1)\n cursor.execute(sql + str(id) + ';')\n else:\n cursor.execute(sql)\n cnt += 1\n if cnt % commit_interval == 0:\n self.conn.commit()\n else:\n print(\"error, the duration should be larger than 0\")\n self.conn.commit()\n cursor.close()\n self.conn.close()\n return cnt\n\n def concurrent_execute_sql(self, threads, duration, sql, max_id=0, commit_interval=500):\n pool = ThreadPool(threads)\n results = [pool.apply_async(self.execute_sql_duration, (duration, sql, max_id, commit_interval)) for _ in range(threads)]\n pool.close()\n pool.join()\n return results\n \ndef init():\n #add the config\n config_path = \"/root/DB-GPT/config/tool_config.yaml\"\n with open(config_path, 'r') as config_file:\n config = yaml.safe_load(config_file) \n db_args =DBArgs('pgsql', config)\n return db_args\n\n#create a table\ndef create_table(table_name,colsize, ncolumns):\n db=Database(init())\n column_definitions = ', '.join(f'name{i} varchar({colsize})' for i in range(ncolumns))\n creat_sql = f'CREATE TABLE {table_name} (id int, {column_definitions}, time timestamp);'\n db.execute_sqls(creat_sql)\n\n#delete the table\ndef delete_table(table_name):\n db=Database(init())\n delete_sql=f'DROP TABLE if exists {table_name}'\n db.execute_sqls(delete_sql)\n\n#print the current time\ndef print_time():\n current_time = datetime.datetime.now()\n formatted_time = current_time.strftime(\"%Y-%m-%d %H:%M:%S\")\n 
print(formatted_time)\n\n\ndef insert_large_data(threads,duration,ncolumns,nrows,colsize,table_name='table1'):\n\n print_time()\n #Delete undeleted tables\n delete_table(table_name)\n #create a new table\n create_table(table_name,colsize, ncolumns)\n db=Database(init())\n #insert the data\n #insert_definitions = ', '.join(f'repeat(round(random()*999)::text,{(colsize//3)})' for i in range(ncolumns))\n insert_definitions = ', '.join(f'(SELECT substr(md5(random()::text), 1, {colsize}))' for i in range(ncolumns))\n insert_data=f'insert into {table_name} select generate_series(1,{nrows}),{insert_definitions}, now();'\n db.concurrent_execute_sql(threads,duration,insert_data,commit_interval=1)\n\n #delete the table\n delete_table(table_name)\n \n #print the end time\n print_time()\nif __name__ == \"__main__\":\n # Number of threads to use for concurrent inserts\n num_threads = 117\n \n # Duration for which to run the inserts (in seconds)\n insert_duration = None\n \n # Number of columns in the table\n num_columns = 26\n \n # Number of rows to insert\n num_rows = 68\n \n # Size of each column (in characters)\n column_size = 78\n \n # Table name\n table_name = 'table1'\n \n # Call the insert_large_data function\n insert_large_data(num_threads, insert_duration, num_columns, num_rows, column_size, table_name)\n", + "description": "In an IoT system, 117 sensors generate a large amount of data that needs to be inserted into the database simultaneously. This process involves inserting data into a table containing 26 columns and 68 rows, with each column having a size of 78 characters. The purpose is to simulate the database exception that can occur due to the insertion of such a large volume of data.\n", + "desc": "In an Internet of Things (IoT) scenario, there is a database dedicated to collecting and analyzing data from various sensors, such as temperature, humidity, pressure, light, and motion sensors. This database, called 'SensorDataDB', is designed to handle large volumes of sensor data. One of the main tables in this database is 'SensorReadings', which stores detailed information about the readings from these sensors. This table consists of 68 rows, each representing a reading record for a specific sensor, with a total of 26 columns. These columns store information such as sensor ID, reading type, reading value, timestamp, sensor location, and status information. When 117 sensors simultaneously transmit data at a high frequency, it can put a strain on the database's performance. This can be due to factors such as insufficient data partitioning, lack of buffering mechanisms, or inefficient indexing. As a result, the database may experience increased latency in processing the large volume of incoming data, leading to anomalies in the system. 
These anomalies can affect the timeliness and accuracy of data analysis and may even cause temporary disruptions in sensor data collection and processing.\n" + }, + "10": { + "start_time": "1697298306", + "end_time": "1697298367", + "start_timestamp": "2023-10-14 23:45:06", + "end_timestamp": "2023-10-14 23:46:07", + "alerts": [], + "labels": [ + "highly concurrent updates" + ], + "command": "python anomaly_trigger/main.py --anomaly LOCK_CONTENTION", + "script": "import psycopg2\nimport sys\nsys.path.append('/root/DB-GPT/')\nimport time\nimport datetime\nimport random\nimport yaml\nfrom multiprocessing.pool import *\n\n\nclass DBArgs(object):\n\n def __init__(self, dbtype, config, dbname=None):\n self.dbtype = dbtype\n if self.dbtype == 'mysql':\n self.host = config['host']\n self.port = config['port']\n self.user = config['user']\n self.password = config['password']\n self.dbname = dbname if dbname else config['dbname']\n self.driver = 'com.mysql.jdbc.Driver'\n self.jdbc = 'jdbc:mysql://'\n else:\n self.host = config['host']\n self.port = config['port']\n self.user = config['user']\n self.password = config['password']\n self.dbname = dbname if dbname else config['dbname']\n self.driver = 'org.postgresql.Driver'\n self.jdbc = 'jdbc:postgresql://'\n\nclass Database():\n def __init__(self, args, timeout=-1):\n self.args = args\n self.conn = self.resetConn(timeout)\n\n\n # self.schema = self.compute_table_schema()\n\n def resetConn(self, timeout=-1):\n conn = psycopg2.connect(database=self.args.dbname,\n user=self.args.user,\n password=self.args.password,\n host=self.args.host,\n port=self.args.port)\n return conn\n \n def execute_sqls(self,sql):\n self.conn =self.resetConn(timeout=-1)\n cur = self.conn.cursor()\n cur.execute(sql)\n self.conn.commit()\n cur.close()\n self.conn.close()\n\n def execute_sql_duration(self, duration, sql, max_id=0, commit_interval=500):\n self.conn = self.resetConn(timeout=-1)\n cursor = self.conn.cursor()\n start = time.time()\n cnt = 0\n if duration > 0:\n while (time.time() - start) < duration:\n if max_id > 0:\n id = random.randint(1, max_id - 1)\n cursor.execute(sql + str(id) + ';')\n else:\n cursor.execute(sql)\n cnt += 1\n if cnt % commit_interval == 0:\n self.conn.commit()\n else:\n print(\"error, the duration should be larger than 0\")\n self.conn.commit()\n cursor.close()\n self.conn.close()\n return cnt\n\n def concurrent_execute_sql(self, threads, duration, sql, max_id=0, commit_interval=500):\n pool = ThreadPool(threads)\n results = [pool.apply_async(self.execute_sql_duration, (duration, sql, max_id, commit_interval)) for _ in range(threads)]\n pool.close()\n pool.join()\n return results\n\ndef init():\n #add the config\n config_path = \"/root/DB-GPT/config/tool_config.yaml\"\n with open(config_path, 'r') as config_file:\n config = yaml.safe_load(config_file) \n db_args =DBArgs('pgsql', config)\n return db_args\n\n\n#create a table\ndef create_table(table_name,colsize, ncolumns):\n db=Database(init())\n column_definitions = ', '.join(f'name{i} varchar({colsize})' for i in range(ncolumns))\n creat_sql = f'CREATE TABLE {table_name} (id int, {column_definitions}, time timestamp);'\n db.execute_sqls(creat_sql)\n\n#delete the table\ndef delete_table(table_name):\n db=Database(init())\n delete_sql=f'DROP TABLE if exists {table_name}'\n db.execute_sqls(delete_sql)\n\n#print the current time\ndef print_time():\n current_time = datetime.datetime.now()\n formatted_time = current_time.strftime(\"%Y-%m-%d %H:%M:%S\")\n print(formatted_time)\n\n\ndef 
lock_contention(threads,duration,ncolumns,nrows,colsize,table_name='table1'):\n #create a new table\n print_time()\n delete_table(table_name)\n create_table(table_name,colsize, ncolumns)\n db=Database(init())\n # insert some data to be updated \n insert_definitions = ', '.join(f'(SELECT substr(md5(random()::text), 1, {colsize}))' for i in range(ncolumns))\n insert_data=f'insert into {table_name} select generate_series(1,{nrows}),{insert_definitions}, now();' \n db.execute_sqls(insert_data) \n pool = Pool(threads)\n for _ in range(threads):\n pool.apply_async(\n lock, (table_name, ncolumns, colsize, duration, nrows))\n pool.close()\n pool.join()\n #delete the table\n delete_table(table_name)\n print_time()\n\ndef lock(table_name, ncolumns, colsize, duration, nrows):\n args=init()\n start = time.time()\n #lock_contention\n while time.time()-start < duration:\n conn = psycopg2.connect(database=args.dbname, user=args.user, password=args.password,\n host=args.host, port=args.port)\n cur = conn.cursor()\n while time.time()-start < duration:\n col_name = random.randint(0, ncolumns-1)\n row_name = random.randint(1, nrows-1)\n lock_contention = f'update {table_name} set name{col_name}=(SELECT substr(md5(random()::text), 1, {colsize})) where id ={row_name}'\n #db.concurrent_execute_sql(threads,duration,lock_contention,nrows)\n cur.execute(lock_contention)\n conn.commit()\n conn.commit()\n conn.close()\n\nif __name__ == \"__main__\":\n # Number of threads to use for concurrent inserts\n num_threads = 55\n \n # Duration for which to run the inserts (in seconds)\n insert_duration = None\n \n # Number of columns in the table\n num_columns = 97\n \n # Number of rows to insert\n num_rows = 255\n \n # Size of each column (in characters)\n column_size = 60\n \n # Table name\n table_name = 'table1'\n \n # Call the insert_large_data function\n lock_contention(num_threads, insert_duration, num_columns, num_rows, column_size, table_name)\n", + "description": "In a database used for a real-time online ticket booking system, 55 customers are trying to book tickets simultaneously. This operation triggers a lock contention, where multiple users compete to lock the database table and perform the update operation. The database table contains 97 columns, 255 rows, and each column has a size of 60 characters.\n", + "desc": "In this scenario, a database named 'BusinessDB' is utilized in a business environment to store and process various business data. Within this database, there is a key table called 'TransactionRecords' that records detailed information about business transactions. This table consists of 255 rows of data, with each row representing a transaction record, and it contains a total of 97 columns, with each column capable of storing information up to 60 characters long. These columns may include transaction ID, customer ID, product ID, transaction type (such as sales, refunds, purchases), transaction amount, date and time, payment method, employee ID, transaction status, and other relevant details. In this scenario, there are 55 users concurrently performing frequent update operations on the 'TransactionRecords' table, such as modifying transaction statuses, updating transaction amounts, or adding transaction notes. Due to the high concurrency and the database's locking mechanism, these concurrent update operations could result in a contention for locking the database table. Such contention, if it persists for a prolonged period, could lead to performance issues within the database. 
During peak business hours, this prolonged locking might cause delays or failures in processing other users' transaction requests, thereby impacting the daily business operations. Moreover, if such incidents occur frequently, they could also cause rapid growth in the database transaction log, consuming excessive storage space, and potentially causing temporary interruptions in database services.\n" + }, + "11": { + "start_time": "1697298427", + "end_time": "1697298498", + "start_timestamp": "2023-10-14 23:47:07", + "end_timestamp": "2023-10-14 23:48:18", + "alerts": [], + "labels": [ + "highly deletes" + ], + "command": "python anomaly_trigger/main.py --anomaly VACUUM", + "script": "import psycopg2\nimport sys\nsys.path.append('/root/DB-GPT/')\nimport time\nimport datetime\nimport random\nimport yaml\nfrom multiprocessing.pool import *\n\n\nclass DBArgs(object):\n\n def __init__(self, dbtype, config, dbname=None):\n self.dbtype = dbtype\n if self.dbtype == 'mysql':\n self.host = config['host']\n self.port = config['port']\n self.user = config['user']\n self.password = config['password']\n self.dbname = dbname if dbname else config['dbname']\n self.driver = 'com.mysql.jdbc.Driver'\n self.jdbc = 'jdbc:mysql://'\n else:\n self.host = config['host']\n self.port = config['port']\n self.user = config['user']\n self.password = config['password']\n self.dbname = dbname if dbname else config['dbname']\n self.driver = 'org.postgresql.Driver'\n self.jdbc = 'jdbc:postgresql://'\n\nclass Database():\n def __init__(self, args, timeout=-1):\n self.args = args\n self.conn = self.resetConn(timeout)\n\n\n # self.schema = self.compute_table_schema()\n\n def resetConn(self, timeout=-1):\n conn = psycopg2.connect(database=self.args.dbname,\n user=self.args.user,\n password=self.args.password,\n host=self.args.host,\n port=self.args.port)\n return conn\n \n def execute_sqls(self,sql):\n self.conn =self.resetConn(timeout=-1)\n cur = self.conn.cursor()\n cur.execute(sql)\n self.conn.commit()\n cur.close()\n self.conn.close()\n\n def execute_sql_duration(self, duration, sql, max_id=0, commit_interval=500):\n self.conn = self.resetConn(timeout=-1)\n cursor = self.conn.cursor()\n start = time.time()\n cnt = 0\n if duration > 0:\n while (time.time() - start) < duration:\n if max_id > 0:\n id = random.randint(1, max_id - 1)\n cursor.execute(sql + str(id) + ';')\n else:\n cursor.execute(sql)\n cnt += 1\n if cnt % commit_interval == 0:\n self.conn.commit()\n else:\n print(\"error, the duration should be larger than 0\")\n self.conn.commit()\n cursor.close()\n self.conn.close()\n return cnt\n\n def concurrent_execute_sql(self, threads, duration, sql, max_id=0, commit_interval=500):\n pool = ThreadPool(threads)\n results = [pool.apply_async(self.execute_sql_duration, (duration, sql, max_id, commit_interval)) for _ in range(threads)]\n pool.close()\n pool.join()\n return results\n \ndef init():\n #add the config\n config_path = \"/root/DB-GPT/config/tool_config.yaml\"\n with open(config_path, 'r') as config_file:\n config = yaml.safe_load(config_file) \n db_args =DBArgs('pgsql', config)\n return db_args\n\n\n#create a table\ndef create_table(table_name,colsize, ncolumns):\n db=Database(init())\n column_definitions = ', '.join(f'name{i} varchar({colsize})' for i in range(ncolumns))\n creat_sql = f'CREATE TABLE {table_name} (id int, {column_definitions}, time timestamp);'\n db.execute_sqls(creat_sql)\n\n#delete the table\ndef delete_table(table_name):\n db=Database(init())\n delete_sql=f'DROP TABLE if exists {table_name}'\n 
db.execute_sqls(delete_sql)\n\n#print the current time\ndef print_time():\n current_time = datetime.datetime.now()\n formatted_time = current_time.strftime(\"%Y-%m-%d %H:%M:%S\")\n print(formatted_time)\n\n\ndef vacuum(threads,duration,ncolumns,nrows,colsize,table_name='table1'):\n db=Database(init())\n #create a new table\n print_time()\n delete_table(table_name)\n create_table(table_name,colsize, ncolumns)\n\n # insert some data to be deleted\n insert_definitions = ', '.join(f'(SELECT substr(md5(random()::text), 1, {colsize}))' for i in range(ncolumns))\n insert_data=f'insert into {table_name} select generate_series(1,{nrows}),{insert_definitions}, now();' \n db.execute_sqls(insert_data) \n\n # delete 80% of the rows\n delete_nrows=int(nrows*0.8)\n vacuum=f'delete from {table_name} where id < {delete_nrows};'\n db.execute_sqls(vacuum)\n\n # do the select , then the vacuum occurs\n select='select * from '+table_name+' where id='\n db.concurrent_execute_sql(threads,duration,select,nrows)\n\n #delete the table\n delete_table(table_name)\n print_time()\n\n\nif __name__ == \"__main__\":\n # Number of threads to use for concurrent inserts\n num_threads = 177\n \n # Duration for which to run the inserts (in seconds)\n insert_duration = None\n \n # Number of columns in the table\n num_columns = 11\n \n # Number of rows to insert\n num_rows = 3025995\n \n # Size of each column (in characters)\n column_size = 78\n \n # Table name\n table_name = 'table1'\n \n # Call the insert_large_data function\n vacuum(num_threads, insert_duration, num_columns, num_rows, column_size, table_name)\n", + "description": "In the database of an online store, 177 users are simultaneously searching a database table that contains 11 columns, 3,025,995 rows, and each column size is 78 characters. The search operation is followed by a vacuum operation on the database table, which aims to optimize the table's performance and storage space utilization. This scenario simulates the potential exception that could occur during this process.\n", + "desc": "In the Internet of Things (IoT) scenario, there is a database designed to store and analyze sensor data named 'SensorDataDB'. This database is used to handle a vast amount of data from various types of sensors. Within this database, there is a key table called 'SensorReadings' which contains 3,025,995 rows of data, each representing a reading from a sensor. The 'SensorReadings' table has 11 columns including sensor ID, reading type, reading value, timestamp, sensor location, and status information. In this scenario, 177 sensors are transmitting data simultaneously at a high frequency. The lack of adequate optimization measures such as vacuuming or proper data cleanup can impact the database's performance. Without implementing such measures, the database may become prone to anomalies as a result of the large volume of data and the inability to efficiently process these concurrent write requests. 
These anomalies could ultimately affect the overall performance and functionality of the IoT system.\n" + }, + "12": { + "start_time": "1697298558", + "end_time": "1697298672", + "start_timestamp": "2023-10-14 23:49:18", + "end_timestamp": "2023-10-14 23:51:12", + "alerts": [], + "labels": [ + "too many indexes" + ], + "command": "python anomaly_trigger/main.py --anomaly REDUNDANT_INDEX", + "script": "import psycopg2\nimport sys\nsys.path.append('/root/DB-GPT/')\nimport time\nimport datetime\nimport random\nimport yaml\nfrom multiprocessing.pool import *\n\n\nclass DBArgs(object):\n\n def __init__(self, dbtype, config, dbname=None):\n self.dbtype = dbtype\n if self.dbtype == 'mysql':\n self.host = config['host']\n self.port = config['port']\n self.user = config['user']\n self.password = config['password']\n self.dbname = dbname if dbname else config['dbname']\n self.driver = 'com.mysql.jdbc.Driver'\n self.jdbc = 'jdbc:mysql://'\n else:\n self.host = config['host']\n self.port = config['port']\n self.user = config['user']\n self.password = config['password']\n self.dbname = dbname if dbname else config['dbname']\n self.driver = 'org.postgresql.Driver'\n self.jdbc = 'jdbc:postgresql://'\n\nclass Database():\n def __init__(self, args, timeout=-1):\n self.args = args\n self.conn = self.resetConn(timeout)\n\n\n # self.schema = self.compute_table_schema()\n\n def resetConn(self, timeout=-1):\n conn = psycopg2.connect(database=self.args.dbname,\n user=self.args.user,\n password=self.args.password,\n host=self.args.host,\n port=self.args.port)\n return conn\n \n def execute_sqls(self,sql):\n self.conn =self.resetConn(timeout=-1)\n cur = self.conn.cursor()\n cur.execute(sql)\n self.conn.commit()\n cur.close()\n self.conn.close()\n\n def execute_sql_duration(self, duration, sql, max_id=0, commit_interval=500):\n self.conn = self.resetConn(timeout=-1)\n cursor = self.conn.cursor()\n start = time.time()\n cnt = 0\n if duration > 0:\n while (time.time() - start) < duration:\n if max_id > 0:\n id = random.randint(1, max_id - 1)\n cursor.execute(sql + str(id) + ';')\n else:\n cursor.execute(sql)\n cnt += 1\n if cnt % commit_interval == 0:\n self.conn.commit()\n else:\n print(\"error, the duration should be larger than 0\")\n self.conn.commit()\n cursor.close()\n self.conn.close()\n return cnt\n\n def concurrent_execute_sql(self, threads, duration, sql, max_id=0, commit_interval=500):\n pool = ThreadPool(threads)\n results = [pool.apply_async(self.execute_sql_duration, (duration, sql, max_id, commit_interval)) for _ in range(threads)]\n pool.close()\n pool.join()\n return results\n \n def build_index(self, table_name, idx_num):\n self.conn = self.resetConn(timeout=-1)\n cursor = self.conn.cursor()\n \n for i in range(0, idx_num):\n the_sql = 'CREATE INDEX index_' + table_name + '_' + str(i) + ' ON ' + table_name + '(name' + str(i) + ');'\n print(the_sql)\n cursor.execute(the_sql)\n\n \n self.conn.commit()\n self.conn.close()\n return\n\n\n \n def drop_index(self,table_name):\n self.conn = self.resetConn(timeout=-1)\n cursor = self.conn.cursor()\n cursor.execute(\"select indexname from pg_indexes where tablename='\"+table_name+\"';\")\n idxs = cursor.fetchall()\n for idx in idxs:\n the_sql = 'DROP INDEX ' + idx[0] + ';'\n cursor.execute(the_sql)\n print(the_sql)\n self.conn.commit()\n self.conn.close()\n return\n\n\ndef init():\n #add the config\n config_path = \"/root/DB-GPT/config/tool_config.yaml\"\n with open(config_path, 'r') as config_file:\n config = yaml.safe_load(config_file) \n db_args 
=DBArgs('pgsql', config)\n return db_args\n\n\n#create a table\ndef create_table(table_name,colsize, ncolumns):\n db=Database(init())\n column_definitions = ', '.join(f'name{i} varchar({colsize})' for i in range(ncolumns))\n creat_sql = f'CREATE TABLE {table_name} (id int, {column_definitions}, time timestamp);'\n db.execute_sqls(creat_sql)\n\n#delete the table\ndef delete_table(table_name):\n db=Database(init())\n delete_sql=f'DROP TABLE if exists {table_name}'\n db.execute_sqls(delete_sql)\n\n#print the current time\ndef print_time():\n current_time = datetime.datetime.now()\n formatted_time = current_time.strftime(\"%Y-%m-%d %H:%M:%S\")\n print(formatted_time)\n\ndef redundent_index(threads,duration,ncolumns,nrows,colsize,nindex,table_name='table1'):\n #create a new table\n print_time()\n delete_table(table_name)\n create_table(table_name,colsize, ncolumns)\n db=Database(init())\n # insert some data to be updated \n insert_definitions = ', '.join(f'(SELECT substr(md5(random()::text), 1, {colsize}))' for i in range(ncolumns))\n insert_data=f'insert into {table_name} select generate_series(1,{nrows}),{insert_definitions}, now();' \n db.execute_sqls(insert_data) \n\n #initialization of the indexes\n nindex=int((nindex*ncolumns)/10)\n db.build_index(table_name,nindex)\n id_index='CREATE INDEX index_'+table_name+'_id ON '+table_name+'(id);'\n db.execute_sqls(id_index)\n\n #lock_contention\n pool = Pool(threads)\n for _ in range(threads):\n pool.apply_async(\n lock, (table_name, ncolumns, colsize, duration, nrows))\n pool.close()\n pool.join()\n\n #drop the index\n db.drop_index(table_name)\n\n #delete the table\n delete_table(table_name)\n print_time()\n\n\ndef lock(table_name, ncolumns, colsize, duration, nrows):\n args=init()\n start = time.time()\n #lock_contention\n while time.time()-start < duration:\n conn = psycopg2.connect(database=args.dbname, user=args.user, password=args.password,\n host=args.host, port=args.port)\n cur = conn.cursor()\n while time.time()-start < duration:\n col_name = random.randint(0, ncolumns-1)\n row_name = random.randint(1, nrows-1)\n lock_contention = f'update {table_name} set name{col_name}=(SELECT substr(md5(random()::text), 1, {colsize})) where id ={row_name}'\n #db.concurrent_execute_sql(threads,duration,lock_contention,nrows)\n cur.execute(lock_contention)\n conn.commit()\n conn.commit()\n conn.close()\n\nif __name__ == \"__main__\":\n # Number of threads to use for concurrent inserts\n num_threads = 5\n \n # Duration for which to run the inserts (in seconds)\n insert_duration = None\n \n # Number of columns in the table\n num_columns = 87\n \n # Number of rows to insert\n num_rows = 831757\n \n # Size of each column (in characters)\n column_size = 56\n \n # Table name\n table_name = 'table1'\n \n nindex=6\n \n # Call the insert_large_data function\n redundent_index(num_threads, insert_duration, num_columns, num_rows, column_size, nindex,table_name)\n", + "description": "In a financial database with 87 columns and 831,757 rows, each with a column size of 56 characters, a large number of unnecessary indexes are created at the beginning of the query operation for various financial information such as transaction date, account holder, and transaction amount. This can result in additional storage consumption and performance overhead. 
The query is performed by 5 users simultaneously.\n", + "desc": "In a business intelligence scenario, particularly involving financial data analysis and reporting, there is a database called 'CorporateFinanceDB' that specializes in storing and processing the financial data of large corporations. This database contains multiple tables, with one important table named 'FinancialRecords' that records various financial transactions and statement information for the company. The 'FinancialRecords' table consists of 831,757 rows of data, where each row represents a financial record. It contains a total of 87 columns, each column capable of storing information up to 56 characters in length. These columns include transaction ID, transaction type (such as income, expenditure, assets, liabilities), amount, date, department, project code, budget code, financial year, audit status, and more. In a typical business intelligence analysis process, to efficiently respond to complex query demands such as budget analysis, income reports, or audits, the database administrator might create many indexes before running these queries. These indexes could be based on transaction type, date range, department, or project code. However, if a query involves retrieving data from a large table like 'FinancialRecords', particularly when there are redundant indexes in place, it could lead to redundant index anomalies. These anomalies could manifest as additional storage usage, increased processing overhead, or even fragmented indexes. Overall, this could result in delayed report generation, thus affecting the efficiency of the decision-making process within the business intelligence environment.\n" + }, + "13": { + "start_time": "1697298732", + "end_time": "1697298823", + "start_timestamp": "2023-10-14 23:52:12", + "end_timestamp": "2023-10-14 23:53:43", + "alerts": [], + "labels": [ + "INSERT_LARGE_DATA", + "IO_CONTENTION" + ], + "command": "python anomaly_trigger/main.py --anomaly INSERT_LARGE_DATA,IO_CONTENTION", + "script": "import os\nimport datetime\n\n#print the current time\ndef print_time():\n current_time = datetime.datetime.now()\n formatted_time = current_time.strftime(\"%Y-%m-%d %H:%M:%S\")\n print(formatted_time)\n\nif __name__ == \"__main__\":\n print_time()\n command = (\n \"su - root -c 'cd /sysbench-tpcc-master; \"\n \"./tpcc.lua --db-driver=pgsql --tables=2 --scale=3 --threads=50 --events=0 \"\n \"--pgsql-host=xxxx --pgsql-user=xxxx --pgsql-password=xxxx \"\n \"--pgsql-port=5432 --pgsql-db=tpcc --time=90 --rand-type=uniform --report-interval=10 run'\"\n )\n\n os.system(command)\n print_time()\n", + "description": "In a system where multiple users are sharing files, there is contention for input/output (I/O) operations. This causes a slowdown in file transfers. By running the command \"python anomaly_trigger/main.py --anomaly INSERT_LARGE_DATA,IO_CONTENTION\", the script simulates a scenario where multiple users are uploading, downloading, or editing files simultaneously, creating I/O contention.\n", + "desc": "In an online file sharing system called 'TeamFileShareDB', multiple users are simultaneously uploading, downloading, and editing files. This system not only stores the files themselves but also records metadata such as uploader information, file size, creation and modification dates, version history, access permissions, and download counts. The system is commonly used for sharing large files among team members, such as presentations, video conference recordings, or design drawings. 
However, due to the high concurrency in file operations, the 'TeamFileShareDB' database experiences challenges related to I/O (input/output) contention. When multiple users attempt to upload or download large files at the same time, the system's storage and network bandwidth become strained. This I/O contention leads to slower file transfer speeds, especially when there is limited bandwidth or insufficient server processing capabilities. Additionally, the frequent write operations in the database, such as file uploads and metadata updates, can impact the overall database performance. During peak periods, the database may encounter issues with locking and transaction management, further slowing down file processing and the recording of metadata.\n" + }, + "14": { + "start_time": "1697298883", + "end_time": "1697298943", + "start_timestamp": "2023-10-14 23:54:43", + "end_timestamp": "2023-10-14 23:55:43", + "alerts": [], + "labels": [ + "POOR JOIN PERFORMANCE", + "CPU CONTENTION" + ], + "command": "python anomaly_trigger/main.py --anomaly POOR_JOIN_PERFORMANCE,CPU_CONTENTION", + "script": "import os\nimport re\nimport time\n\nimport psycopg2\n\n\nREPEATCOUNT = 1\nTIMELOGPATH = str(int(time.time())) + \"_job_trigger_time_log.txt\"\nTIMELOG = open(TIMELOGPATH, 'w+')\n\n\nclass Database():\n\n def __init__(self):\n self.conn = None\n self.conn = psycopg2.connect(database='imdbload',\n user='xxxx',\n password='xxxx',\n host='xxxx',\n port=5432)\n\n def execute_sql(self, sql):\n fail = 1\n cur = self.conn.cursor()\n i = 0\n cnt = 3\n while fail == 1 and i < cnt:\n try:\n fail = 0\n cur.execute(sql)\n except BaseException as error:\n fail = 1\n print(error)\n res = []\n if fail == 0:\n res = cur.fetchall()\n i = i + 1\n if fail == 1:\n # print(\"SQL Execution Fatal!!\", sql)\n return 0, ''\n elif fail == 0:\n return 1, res\n\n\ndef all_sql_files():\n res_path = \"{}/join-order-benchmark-master/\".format(\n os.path.dirname(os.path.abspath(__file__)))\n # all_file_list = list(filter(file_filter, os.listdir(res_path)))\n # all_file_list = sorted(all_file_list, key=custom_sort)\n all_file_list = [\n '1a.sql', '1b.sql', '1c.sql', '1d.sql',\n '2a.sql', '2b.sql', '2c.sql', '2d.sql',\n '3a.sql', '3b.sql', '3c.sql',\n '4a.sql', '4b.sql', '4c.sql',\n '5a.sql', '5b.sql', '5c.sql',\n '6a.sql', '6b.sql', '6c.sql', '6d.sql', '6e.sql', '6f.sql',\n '7a.sql', '7b.sql', '7c.sql',\n '8a.sql', '8b.sql', '8c.sql', '8d.sql',\n '9a.sql', '9b.sql', '9c.sql', '9d.sql',\n '10a.sql', '10b.sql', '10c.sql',\n '11a.sql', '11b.sql', '11c.sql', '11d.sql',\n '12a.sql', '12b.sql', '12c.sql',\n '13a.sql', '13b.sql', '13c.sql', '13d.sql',\n '14a.sql', '14b.sql', '14c.sql',\n '15a.sql', '15b.sql', '15c.sql', '15d.sql',\n '16a.sql', '16b.sql', '16c.sql', '16d.sql',\n '17a.sql', '17b.sql', '17c.sql', '17d.sql', '17e.sql', '17f.sql',\n '18a.sql', '18b.sql', '18c.sql',\n '19a.sql', '19b.sql', '19c.sql', '19d.sql',\n '20a.sql', '20b.sql', '20c.sql',\n '21a.sql', '21b.sql', '21c.sql',\n '22a.sql', '22b.sql', '22c.sql', '22d.sql',\n '23a.sql', '23b.sql', '23c.sql',\n '24a.sql', '24b.sql',\n '25a.sql', '25b.sql', '25c.sql',\n '26a.sql', '26b.sql', '26c.sql',\n '27a.sql', '27b.sql', '27c.sql',\n '28a.sql', '28b.sql', '28c.sql',\n '29a.sql', '29b.sql', '29c.sql',\n '30a.sql', '30b.sql', '30c.sql',\n '31a.sql', '31b.sql', '31c.sql',\n '32a.sql', '32b.sql',\n '33a.sql', '33b.sql', '33c.sql']\n\n print(all_file_list)\n files_list = []\n for file in all_file_list:\n files_list.append(res_path + file)\n return files_list\n\n\ndef 
custom_sort(item):\n # \u63d0\u53d6\u6570\u5b57\u548c\u5b57\u6bcd\u90e8\u5206\n match = re.match(r'(\\d+)(\\D+)', item)\n # \u5c06\u6570\u5b57\u90e8\u5206\u8f6c\u6362\u4e3a\u6574\u6570\u4ee5\u8fdb\u884c\u6bd4\u8f83\n num_part = int(match.group(1))\n # \u8fd4\u56de\u5143\u7ec4\u4ee5\u6309\u6570\u5b57\u548c\u5b57\u6bcd\u6392\u5e8f\n return (num_part, match.group(2))\n\n\ndef file_filter(f):\n if f[-4:] == '.sql' and 'schema' not in f and 'fkindexes' not in f:\n return True\n else:\n return False\n\n\ndef get_sql_from_file(file_name):\n file = open(file_name)\n lines = file.readlines().copy()\n sql = ''\n for line in lines:\n sql += line\n sql = sql.replace('\n', ' ').replace(' ', ' ').replace(' ', ' ')\n file.close()\n return sql\n\n\ndef test_hint_from_file(sql_file):\n db = Database()\n sql = get_sql_from_file(sql_file)\n success, result_cont = db.execute_sql(sql)\n print(success, result_cont)\n\n\ndef test_all():\n sql_files = all_sql_files()\n\n for sql_file in list(sql_files)[:-10]:\n if sql_file:\n test_hint_from_file(sql_file)\n\n\ndef test_one():\n res_path = \"{}/join-order-benchmark-master/\".format(\n os.path.dirname(os.path.abspath(__file__)))\n test_hint_from_file(res_path + '1a.sql')\n\n\nif __name__ == '__main__':\n for i in range(0, REPEATCOUNT):\n TIMELOG.write(str(int(time.time()))+\";\")\n test_all()\n TIMELOG.write(str(int(time.time()))+\"\n\")\n TIMELOG.flush()\n\n TIMELOG.close()\n\n)\n", + "description": "In a data analysis task, the join operation between two large tables is performed, but the performance is poor due to inefficient join algorithms. Additionally, the CPU is heavily burdened by other processes running simultaneously, resulting in contention and further slowing down the join operation.\n", + "desc": "In an Internet of Things (IoT) scenario, suppose there is a database named 'IoTDataDB' that collects and stores large volumes of sensor data. This database is used to analyze and monitor various IoT devices in real-time. One of the key tables in this database is called 'SensorReadings', which contains data from different sensors such as temperature sensors, humidity sensors, and motion sensors. Each row in this table represents a reading from a specific sensor, and it includes information such as sensor ID, sensor type, reading value, timestamp, and location. Now, imagine that multiple users are trying to analyze the data from the 'SensorReadings' table by performing complex queries that involve joining this table with other tables in the database. However, due to poor join performance, these queries are taking a long time to execute, leading to delays in data analysis and real-time monitoring. The poor join performance may be a result of inefficient indexing, improper query optimization, or a large number of rows in the 'SensorReadings' table. Additionally, in this scenario, there is also CPU contention. This means that there are multiple processes or threads competing for the available CPU resources. As a result, the CPU may become overloaded, causing delays and performance issues in executing queries and processing data. Overall, the combination of poor join performance and CPU contention can lead to significant delays in data analysis and real-time monitoring in an IoT environment.
This can impact critical decision-making processes, operational efficiency, and the overall performance of the IoT system.\n" + }, + "15": { + "start_time": "1697299003", + "end_time": "1697299153", + "start_timestamp": "2023-10-14 23:56:43", + "end_timestamp": "2023-10-14 23:59:13", + "alerts": [], + "labels": [ + "FETCH_LARGE_DATA", + "CORRELATED SUBQUERY" + ], + "command": "python anomaly_trigger/main.py --anomaly FETCH_LARGE_DATA,CORRELATED_SUBQUERY", + "script": "import os\nimport re\nimport time\n\nimport psycopg2\n\n\nREPEATCOUNT = 1\nTIMELOGPATH = str(int(time.time())) + \"_tpch_trigger_time_log.txt\"\nTIMELOG = open(TIMELOGPATH, 'w+')\n\n\nclass Database():\n\n def __init__(self):\n self.conn = None\n self.conn = psycopg2.connect(database='tpch',\n user='xxxx',\n password='xxxx',\n host='xxxx',\n port=5432)\n\n def execute_sql(self, sql):\n fail = 1\n cur = self.conn.cursor()\n i = 0\n cnt = 3\n while fail == 1 and i < cnt:\n try:\n fail = 0\n cur.execute(sql)\n except BaseException as error:\n fail = 1\n print(error)\n res = []\n if fail == 0:\n res = cur.fetchall()\n i = i + 1\n if fail == 1:\n # print(\"SQL Execution Fatal!!\", sql)\n return 0, ''\n elif fail == 0:\n return 1, res\n\n\ndef all_sql_files():\n res_path = \"{}/tpch-queries/\".format(\n os.path.dirname(os.path.abspath(__file__)))\n # all_file_list = list(filter(file_filter, os.listdir(res_path)))\n # all_file_list = sorted(all_file_list, key=custom_sort)\n all_file_list = [\n '4.explain.sql']\n\n print(all_file_list)\n files_list = []\n for file in all_file_list:\n files_list.append(res_path + file)\n return files_list\n\n\ndef custom_sort(item):\n # \u63d0\u53d6\u6570\u5b57\u548c\u5b57\u6bcd\u90e8\u5206\n match = re.match(r'(\\d+)(\\D+)', item)\n # \u5c06\u6570\u5b57\u90e8\u5206\u8f6c\u6362\u4e3a\u6574\u6570\u4ee5\u8fdb\u884c\u6bd4\u8f83\n num_part = int(match.group(1))\n # \u8fd4\u56de\u5143\u7ec4\u4ee5\u6309\u6570\u5b57\u548c\u5b57\u6bcd\u6392\u5e8f\n return (num_part, match.group(2))\n\n\ndef file_filter(f):\n if f[-4:] == '.sql' and 'schema' not in f and 'fkindexes' not in f:\n return True\n else:\n return False\n\n\ndef get_sql_from_file(file_name):\n file = open(file_name)\n lines = file.readlines().copy()\n sql = ''\n for line in lines:\n sql += line\n sql = sql.replace('\n', ' ').replace(' ', ' ').replace(' ', ' ')\n file.close()\n return sql\n\n\ndef test_hint_from_file(sql_file):\n db = Database()\n sql = get_sql_from_file(sql_file)\n success, result_cont = db.execute_sql(sql)\n print(success, result_cont)\n\n\ndef test_all():\n sql_files = all_sql_files()\n\n for sql_file in list(sql_files):\n if sql_file:\n test_hint_from_file(sql_file)\n\n\ndef test_one():\n res_path = \"{}/tpch-queries/\".format(\n os.path.dirname(os.path.abspath(__file__)))\n test_hint_from_file(res_path + '1.explain.sql')\n\n\nif __name__ == '__main__':\n for i in range(0, REPEATCOUNT):\n TIMELOG.write(str(int(time.time()))+\";\")\n test_all()\n TIMELOG.write(str(int(time.time()))+\"\n\")\n TIMELOG.flush()\n\n TIMELOG.close()\n", + "description": "In the database of an e-commerce platform, when querying the inventory for a large number of products, there is a possibility of using correlated subqueries, which can lead to a deterioration in performance. This can cause a delay in retrieving large amounts of data.\n", + "desc": "In an e-commerce scenario, imagine a database called 'ECommerceDB' that stores information about various products. 
One important table in this database is 'ProductInventory', which records inventory details for thousands or even hundreds of thousands of products. This table contains information such as product ID, stock level, last inventory update time, supplier ID, and warehouse location. As part of the e-commerce platform, there may be a need to query the inventory level of products, especially for specific categories. To achieve this, the platform may execute correlated subqueries. For example, when querying the total inventory for a specific category, the platform would select all products within that category from the 'ProductDetails' table and then execute subqueries on the 'ProductInventory' table to obtain the inventory data for each product. However, if the category has a large number of products, executing individual subqueries for each product can result in poor performance. This is because the database might need to read a significant amount of data from the disk, leading to I/O bottlenecks and slower query execution times.\n" + }, + "16": { + "start_time": "1697299213", + "end_time": "1697299285", + "start_timestamp": "2023-10-15 00:00:13", + "end_timestamp": "2023-10-15 00:01:25", + "alerts": [], + "labels": [ + "highly concurrent commits or highly concurrent inserts" + ], + "command": "python anomaly_trigger/main.py --anomaly INSERT_LARGE_DATA", + "script": "import psycopg2\nimport sys\nsys.path.append('/root/DB-GPT/')\nimport time\nimport datetime\nimport random\nimport yaml\nfrom multiprocessing.pool import *\n\nclass DBArgs(object):\n\n def __init__(self, dbtype, config, dbname=None):\n self.dbtype = dbtype\n if self.dbtype == 'mysql':\n self.host = config['host']\n self.port = config['port']\n self.user = config['user']\n self.password = config['password']\n self.dbname = dbname if dbname else config['dbname']\n self.driver = 'com.mysql.jdbc.Driver'\n self.jdbc = 'jdbc:mysql://'\n else:\n self.host = config['host']\n self.port = config['port']\n self.user = config['user']\n self.password = config['password']\n self.dbname = dbname if dbname else config['dbname']\n self.driver = 'org.postgresql.Driver'\n self.jdbc = 'jdbc:postgresql://'\n\nclass Database():\n def __init__(self, args, timeout=-1):\n self.args = args\n self.conn = self.resetConn(timeout)\n\n\n # self.schema = self.compute_table_schema()\n\n def resetConn(self, timeout=-1):\n conn = psycopg2.connect(database=self.args.dbname,\n user=self.args.user,\n password=self.args.password,\n host=self.args.host,\n port=self.args.port)\n return conn\n\n def execute_sqls(self,sql):\n self.conn =self.resetConn(timeout=-1)\n cur = self.conn.cursor()\n cur.execute(sql)\n self.conn.commit()\n cur.close()\n self.conn.close()\n\n def execute_sql_duration(self, duration, sql, max_id=0, commit_interval=500):\n self.conn = self.resetConn(timeout=-1)\n cursor = self.conn.cursor()\n start = time.time()\n cnt = 0\n if duration > 0:\n while (time.time() - start) < duration:\n if max_id > 0:\n id = random.randint(1, max_id - 1)\n cursor.execute(sql + str(id) + ';')\n else:\n cursor.execute(sql)\n cnt += 1\n if cnt % commit_interval == 0:\n self.conn.commit()\n else:\n print(\"error, the duration should be larger than 0\")\n self.conn.commit()\n cursor.close()\n self.conn.close()\n return cnt\n\n def concurrent_execute_sql(self, threads, duration, sql, max_id=0, commit_interval=500):\n pool = ThreadPool(threads)\n results = [pool.apply_async(self.execute_sql_duration, (duration, sql, max_id, commit_interval)) for _ in range(threads)]\n 
pool.close()\n pool.join()\n return results\n \ndef init():\n #add the config\n config_path = \"/root/DB-GPT/config/tool_config.yaml\"\n with open(config_path, 'r') as config_file:\n config = yaml.safe_load(config_file) \n db_args =DBArgs('pgsql', config)\n return db_args\n\n#create a table\ndef create_table(table_name,colsize, ncolumns):\n db=Database(init())\n column_definitions = ', '.join(f'name{i} varchar({colsize})' for i in range(ncolumns))\n creat_sql = f'CREATE TABLE {table_name} (id int, {column_definitions}, time timestamp);'\n db.execute_sqls(creat_sql)\n\n#delete the table\ndef delete_table(table_name):\n db=Database(init())\n delete_sql=f'DROP TABLE if exists {table_name}'\n db.execute_sqls(delete_sql)\n\n#print the current time\ndef print_time():\n current_time = datetime.datetime.now()\n formatted_time = current_time.strftime(\"%Y-%m-%d %H:%M:%S\")\n print(formatted_time)\n\n\ndef insert_large_data(threads,duration,ncolumns,nrows,colsize,table_name='table1'):\n\n print_time()\n #Delete undeleted tables\n delete_table(table_name)\n #create a new table\n create_table(table_name,colsize, ncolumns)\n db=Database(init())\n #insert the data\n #insert_definitions = ', '.join(f'repeat(round(random()*999)::text,{(colsize//3)})' for i in range(ncolumns))\n insert_definitions = ', '.join(f'(SELECT substr(md5(random()::text), 1, {colsize}))' for i in range(ncolumns))\n insert_data=f'insert into {table_name} select generate_series(1,{nrows}),{insert_definitions}, now();'\n db.concurrent_execute_sql(threads,duration,insert_data,commit_interval=1)\n\n #delete the table\n delete_table(table_name)\n \n #print the end time\n print_time()\nif __name__ == \"__main__\":\n # Number of threads to use for concurrent inserts\n num_threads = 152\n \n # Duration for which to run the inserts (in seconds)\n insert_duration = None\n \n # Number of columns in the table\n num_columns = 12\n \n # Number of rows to insert\n num_rows = 64\n \n # Size of each column (in characters)\n column_size = 57\n \n # Table name\n table_name = 'table1'\n \n # Call the insert_large_data function\n insert_large_data(num_threads, insert_duration, num_columns, num_rows, column_size, table_name)\n", + "description": "In a sensor data collection system, 152 sensors are generating a large amount of data that needs to be inserted into the database. Each data entry has 12 columns, with each column having a size of 57 characters. The system will simulate the database exception caused by this data insertion process.\n", + "desc": "In the context of an Internet of Things (IoT) environment, imagine a database designed to handle large volumes of data from various types of sensors. This database is referred to as 'SensorDataDB' and is responsible for storing and analyzing sensor data. One of the key tables in this database is named 'SensorReadings' and it contains information from 64 sensors, each with 12 columns. These columns include sensor ID, reading type, reading value, timestamp, sensor location, and status information, with each column capable of storing up to 57 characters. In this scenario, the script is aiming to simulate a situation where 152 sensors simultaneously transmit data to the database at a high frequency. This influx of data can put significant strain on the database's ability to handle the concurrent write requests efficiently. 
If the database lacks proper data partitioning, buffering mechanisms, or appropriate indexing, it may result in performance issues, such as increased write latency and even potential database locking. These issues can lead to anomalies in the database's behavior and overall functionality, handicapping its ability to process the large-scale data insertion effectively in a timely manner.\n" + }, + "17": { + "start_time": "1697299345", + "end_time": "1697299417", + "start_timestamp": "2023-10-15 00:02:25", + "end_timestamp": "2023-10-15 00:03:37", + "alerts": [], + "labels": [ + "highly concurrent commits or highly concurrent inserts" + ], + "command": "python anomaly_trigger/main.py --anomaly INSERT_LARGE_DATA", + "script": "import psycopg2\nimport sys\nsys.path.append('/root/DB-GPT/')\nimport time\nimport datetime\nimport random\nimport yaml\nfrom multiprocessing.pool import *\n\nclass DBArgs(object):\n\n def __init__(self, dbtype, config, dbname=None):\n self.dbtype = dbtype\n if self.dbtype == 'mysql':\n self.host = config['host']\n self.port = config['port']\n self.user = config['user']\n self.password = config['password']\n self.dbname = dbname if dbname else config['dbname']\n self.driver = 'com.mysql.jdbc.Driver'\n self.jdbc = 'jdbc:mysql://'\n else:\n self.host = config['host']\n self.port = config['port']\n self.user = config['user']\n self.password = config['password']\n self.dbname = dbname if dbname else config['dbname']\n self.driver = 'org.postgresql.Driver'\n self.jdbc = 'jdbc:postgresql://'\n\nclass Database():\n def __init__(self, args, timeout=-1):\n self.args = args\n self.conn = self.resetConn(timeout)\n\n\n # self.schema = self.compute_table_schema()\n\n def resetConn(self, timeout=-1):\n conn = psycopg2.connect(database=self.args.dbname,\n user=self.args.user,\n password=self.args.password,\n host=self.args.host,\n port=self.args.port)\n return conn\n\n def execute_sqls(self,sql):\n self.conn =self.resetConn(timeout=-1)\n cur = self.conn.cursor()\n cur.execute(sql)\n self.conn.commit()\n cur.close()\n self.conn.close()\n\n def execute_sql_duration(self, duration, sql, max_id=0, commit_interval=500):\n self.conn = self.resetConn(timeout=-1)\n cursor = self.conn.cursor()\n start = time.time()\n cnt = 0\n if duration > 0:\n while (time.time() - start) < duration:\n if max_id > 0:\n id = random.randint(1, max_id - 1)\n cursor.execute(sql + str(id) + ';')\n else:\n cursor.execute(sql)\n cnt += 1\n if cnt % commit_interval == 0:\n self.conn.commit()\n else:\n print(\"error, the duration should be larger than 0\")\n self.conn.commit()\n cursor.close()\n self.conn.close()\n return cnt\n\n def concurrent_execute_sql(self, threads, duration, sql, max_id=0, commit_interval=500):\n pool = ThreadPool(threads)\n results = [pool.apply_async(self.execute_sql_duration, (duration, sql, max_id, commit_interval)) for _ in range(threads)]\n pool.close()\n pool.join()\n return results\n \ndef init():\n #add the config\n config_path = \"/root/DB-GPT/config/tool_config.yaml\"\n with open(config_path, 'r') as config_file:\n config = yaml.safe_load(config_file) \n db_args =DBArgs('pgsql', config)\n return db_args\n\n#create a table\ndef create_table(table_name,colsize, ncolumns):\n db=Database(init())\n column_definitions = ', '.join(f'name{i} varchar({colsize})' for i in range(ncolumns))\n creat_sql = f'CREATE TABLE {table_name} (id int, {column_definitions}, time timestamp);'\n db.execute_sqls(creat_sql)\n\n#delete the table\ndef delete_table(table_name):\n db=Database(init())\n 
delete_sql=f'DROP TABLE if exists {table_name}'\n db.execute_sqls(delete_sql)\n\n#print the current time\ndef print_time():\n current_time = datetime.datetime.now()\n formatted_time = current_time.strftime(\"%Y-%m-%d %H:%M:%S\")\n print(formatted_time)\n\n\ndef insert_large_data(threads,duration,ncolumns,nrows,colsize,table_name='table1'):\n\n print_time()\n #Delete undeleted tables\n delete_table(table_name)\n #create a new table\n create_table(table_name,colsize, ncolumns)\n db=Database(init())\n #insert the data\n #insert_definitions = ', '.join(f'repeat(round(random()*999)::text,{(colsize//3)})' for i in range(ncolumns))\n insert_definitions = ', '.join(f'(SELECT substr(md5(random()::text), 1, {colsize}))' for i in range(ncolumns))\n insert_data=f'insert into {table_name} select generate_series(1,{nrows}),{insert_definitions}, now();'\n db.concurrent_execute_sql(threads,duration,insert_data,commit_interval=1)\n\n #delete the table\n delete_table(table_name)\n \n #print the end time\n print_time()\nif __name__ == \"__main__\":\n # Number of threads to use for concurrent inserts\n num_threads = 152\n \n # Duration for which to run the inserts (in seconds)\n insert_duration = None\n \n # Number of columns in the table\n num_columns = 34\n \n # Number of rows to insert\n num_rows = 86\n \n # Size of each column (in characters)\n column_size = 65\n \n # Table name\n table_name = 'table1'\n \n # Call the insert_large_data function\n insert_large_data(num_threads, insert_duration, num_columns, num_rows, column_size, table_name)\n", + "description": "In an internet of things (IoT) system, 152 sensors generate a large amount of data that needs to be inserted into a database simultaneously. Each data entry consists of 34 columns, with each column having a size of 65 characters. There are a total of 86 data entries. This simulation aims to trigger a database exception due to the overwhelming amount of data being inserted at once.\n", + "desc": "In an Internet of Things (IoT) scenario, there is a database dedicated to storing sensor data called 'SensorDataDB'. This database is designed to handle a large volume of data from various types of sensors. One of the key tables in this database is 'SensorReadings', which stores information about sensor readings. The table contains 86 rows of data, each representing a reading from a sensor. It has 34 columns, each capable of storing information up to 65 characters long. These columns may include sensor ID, reading type, reading value, timestamp, sensor location, and status information. In this specific scenario, 152 sensors simultaneously start transmitting data at a high frequency. However, due to the lack of effective data partitioning or buffering mechanisms in the database, these large-scale insertions of sensor data may cause performance issues. The database may experience increased write latency and potentially encounter database locking, leading to anomalies. 
This can impact the real-time monitoring and analysis of sensor data, and also affect other operations relying on the database.\n" + }, + "18": { + "start_time": "1697299477", + "end_time": "1697299537", + "start_timestamp": "2023-10-15 00:04:37", + "end_timestamp": "2023-10-15 00:05:37", + "alerts": [], + "labels": [ + "highly concurrent updates" + ], + "command": "python anomaly_trigger/main.py --anomaly LOCK_CONTENTION", + "script": "import psycopg2\nimport sys\nsys.path.append('/root/DB-GPT/')\nimport time\nimport datetime\nimport random\nimport yaml\nfrom multiprocessing.pool import *\n\n\nclass DBArgs(object):\n\n def __init__(self, dbtype, config, dbname=None):\n self.dbtype = dbtype\n if self.dbtype == 'mysql':\n self.host = config['host']\n self.port = config['port']\n self.user = config['user']\n self.password = config['password']\n self.dbname = dbname if dbname else config['dbname']\n self.driver = 'com.mysql.jdbc.Driver'\n self.jdbc = 'jdbc:mysql://'\n else:\n self.host = config['host']\n self.port = config['port']\n self.user = config['user']\n self.password = config['password']\n self.dbname = dbname if dbname else config['dbname']\n self.driver = 'org.postgresql.Driver'\n self.jdbc = 'jdbc:postgresql://'\n\nclass Database():\n def __init__(self, args, timeout=-1):\n self.args = args\n self.conn = self.resetConn(timeout)\n\n\n # self.schema = self.compute_table_schema()\n\n def resetConn(self, timeout=-1):\n conn = psycopg2.connect(database=self.args.dbname,\n user=self.args.user,\n password=self.args.password,\n host=self.args.host,\n port=self.args.port)\n return conn\n \n def execute_sqls(self,sql):\n self.conn =self.resetConn(timeout=-1)\n cur = self.conn.cursor()\n cur.execute(sql)\n self.conn.commit()\n cur.close()\n self.conn.close()\n\n def execute_sql_duration(self, duration, sql, max_id=0, commit_interval=500):\n self.conn = self.resetConn(timeout=-1)\n cursor = self.conn.cursor()\n start = time.time()\n cnt = 0\n if duration > 0:\n while (time.time() - start) < duration:\n if max_id > 0:\n id = random.randint(1, max_id - 1)\n cursor.execute(sql + str(id) + ';')\n else:\n cursor.execute(sql)\n cnt += 1\n if cnt % commit_interval == 0:\n self.conn.commit()\n else:\n print(\"error, the duration should be larger than 0\")\n self.conn.commit()\n cursor.close()\n self.conn.close()\n return cnt\n\n def concurrent_execute_sql(self, threads, duration, sql, max_id=0, commit_interval=500):\n pool = ThreadPool(threads)\n results = [pool.apply_async(self.execute_sql_duration, (duration, sql, max_id, commit_interval)) for _ in range(threads)]\n pool.close()\n pool.join()\n return results\n\ndef init():\n #add the config\n config_path = \"/root/DB-GPT/config/tool_config.yaml\"\n with open(config_path, 'r') as config_file:\n config = yaml.safe_load(config_file) \n db_args =DBArgs('pgsql', config)\n return db_args\n\n\n#create a table\ndef create_table(table_name,colsize, ncolumns):\n db=Database(init())\n column_definitions = ', '.join(f'name{i} varchar({colsize})' for i in range(ncolumns))\n creat_sql = f'CREATE TABLE {table_name} (id int, {column_definitions}, time timestamp);'\n db.execute_sqls(creat_sql)\n\n#delete the table\ndef delete_table(table_name):\n db=Database(init())\n delete_sql=f'DROP TABLE if exists {table_name}'\n db.execute_sqls(delete_sql)\n\n#print the current time\ndef print_time():\n current_time = datetime.datetime.now()\n formatted_time = current_time.strftime(\"%Y-%m-%d %H:%M:%S\")\n print(formatted_time)\n\n\ndef 
lock_contention(threads,duration,ncolumns,nrows,colsize,table_name='table1'):\n #create a new table\n print_time()\n delete_table(table_name)\n create_table(table_name,colsize, ncolumns)\n db=Database(init())\n # insert some data to be updated \n insert_definitions = ', '.join(f'(SELECT substr(md5(random()::text), 1, {colsize}))' for i in range(ncolumns))\n insert_data=f'insert into {table_name} select generate_series(1,{nrows}),{insert_definitions}, now();' \n db.execute_sqls(insert_data) \n pool = Pool(threads)\n for _ in range(threads):\n pool.apply_async(\n lock, (table_name, ncolumns, colsize, duration, nrows))\n pool.close()\n pool.join()\n #delete the table\n delete_table(table_name)\n print_time()\n\ndef lock(table_name, ncolumns, colsize, duration, nrows):\n args=init()\n start = time.time()\n #lock_contention\n while time.time()-start < duration:\n conn = psycopg2.connect(database=args.dbname, user=args.user, password=args.password,\n host=args.host, port=args.port)\n cur = conn.cursor()\n while time.time()-start < duration:\n col_name = random.randint(0, ncolumns-1)\n row_name = random.randint(1, nrows-1)\n lock_contention = f'update {table_name} set name{col_name}=(SELECT substr(md5(random()::text), 1, {colsize})) where id ={row_name}'\n #db.concurrent_execute_sql(threads,duration,lock_contention,nrows)\n cur.execute(lock_contention)\n conn.commit()\n conn.commit()\n conn.close()\n\nif __name__ == \"__main__\":\n # Number of threads to use for concurrent inserts\n num_threads = 156\n \n # Duration for which to run the inserts (in seconds)\n insert_duration = None\n \n # Number of columns in the table\n num_columns = 72\n \n # Number of rows to insert\n num_rows = 291\n \n # Size of each column (in characters)\n column_size = 81\n \n # Table name\n table_name = 'table1'\n \n # Call the insert_large_data function\n lock_contention(num_threads, insert_duration, num_columns, num_rows, column_size, table_name)\n", + "description": "In a database used by an online store, there is a situation where 156 users simultaneously try to perform frequent update operations. These operations involve a database table with 72 columns and 291 rows of records related to products. Each column is 81 characters in size. The users compete to lock the database table and perform the update operations, which might result in database contention and potential exceptions.\n", + "desc": "In an Internet of Things (IoT) scenario, let's imagine a database called 'SensorDataDB' that stores data collected from various sensors. This database is designed to handle a large volume of sensor data, such as temperature, humidity, pressure, light, motion, etc. One of the key tables in this database is named 'SensorReadings' and contains information about sensor readings from 291 different sensors. Each row in this table represents a reading for a specific sensor, and there are 72 columns, each capable of storing information up to 81 characters long. These columns may include sensor ID, reading type, reading value, timestamp, sensor location, and status information. In a specific scenario, there are 156 users or sensor devices that simultaneously attempt to write data to the 'SensorReadings' table. Due to the high concurrency of write operations, the database may encounter lock contention issues. This means that multiple users or devices are competing for access to the same database table simultaneously, leading to a delay or failure in processing these write operations. 
This lock contention could result in performance issues, such as delays in data storage or even database locking, which might impact the overall efficiency of the IoT system or disrupt real-time data analysis. Additionally, if such lock contention occurs frequently, it could lead to increased database transaction log size and potentially cause temporary interruptions in the services provided by the IoT system.\n" + }, + "19": { + "start_time": "1697299598", + "end_time": "1697299673", + "start_timestamp": "2023-10-15 00:06:38", + "end_timestamp": "2023-10-15 00:07:53", + "alerts": [], + "labels": [ + "highly deletes" + ], + "command": "python anomaly_trigger/main.py --anomaly VACUUM", + "script": "import psycopg2\nimport sys\nsys.path.append('/root/DB-GPT/')\nimport time\nimport datetime\nimport random\nimport yaml\nfrom multiprocessing.pool import *\n\n\nclass DBArgs(object):\n\n def __init__(self, dbtype, config, dbname=None):\n self.dbtype = dbtype\n if self.dbtype == 'mysql':\n self.host = config['host']\n self.port = config['port']\n self.user = config['user']\n self.password = config['password']\n self.dbname = dbname if dbname else config['dbname']\n self.driver = 'com.mysql.jdbc.Driver'\n self.jdbc = 'jdbc:mysql://'\n else:\n self.host = config['host']\n self.port = config['port']\n self.user = config['user']\n self.password = config['password']\n self.dbname = dbname if dbname else config['dbname']\n self.driver = 'org.postgresql.Driver'\n self.jdbc = 'jdbc:postgresql://'\n\nclass Database():\n def __init__(self, args, timeout=-1):\n self.args = args\n self.conn = self.resetConn(timeout)\n\n\n # self.schema = self.compute_table_schema()\n\n def resetConn(self, timeout=-1):\n conn = psycopg2.connect(database=self.args.dbname,\n user=self.args.user,\n password=self.args.password,\n host=self.args.host,\n port=self.args.port)\n return conn\n \n def execute_sqls(self,sql):\n self.conn =self.resetConn(timeout=-1)\n cur = self.conn.cursor()\n cur.execute(sql)\n self.conn.commit()\n cur.close()\n self.conn.close()\n\n def execute_sql_duration(self, duration, sql, max_id=0, commit_interval=500):\n self.conn = self.resetConn(timeout=-1)\n cursor = self.conn.cursor()\n start = time.time()\n cnt = 0\n if duration > 0:\n while (time.time() - start) < duration:\n if max_id > 0:\n id = random.randint(1, max_id - 1)\n cursor.execute(sql + str(id) + ';')\n else:\n cursor.execute(sql)\n cnt += 1\n if cnt % commit_interval == 0:\n self.conn.commit()\n else:\n print(\"error, the duration should be larger than 0\")\n self.conn.commit()\n cursor.close()\n self.conn.close()\n return cnt\n\n def concurrent_execute_sql(self, threads, duration, sql, max_id=0, commit_interval=500):\n pool = ThreadPool(threads)\n results = [pool.apply_async(self.execute_sql_duration, (duration, sql, max_id, commit_interval)) for _ in range(threads)]\n pool.close()\n pool.join()\n return results\n \ndef init():\n #add the config\n config_path = \"/root/DB-GPT/config/tool_config.yaml\"\n with open(config_path, 'r') as config_file:\n config = yaml.safe_load(config_file) \n db_args =DBArgs('pgsql', config)\n return db_args\n\n\n#create a table\ndef create_table(table_name,colsize, ncolumns):\n db=Database(init())\n column_definitions = ', '.join(f'name{i} varchar({colsize})' for i in range(ncolumns))\n creat_sql = f'CREATE TABLE {table_name} (id int, {column_definitions}, time timestamp);'\n db.execute_sqls(creat_sql)\n\n#delete the table\ndef delete_table(table_name):\n db=Database(init())\n delete_sql=f'DROP TABLE if exists 
{table_name}'\n db.execute_sqls(delete_sql)\n\n#print the current time\ndef print_time():\n current_time = datetime.datetime.now()\n formatted_time = current_time.strftime(\"%Y-%m-%d %H:%M:%S\")\n print(formatted_time)\n\n\ndef vacuum(threads,duration,ncolumns,nrows,colsize,table_name='table1'):\n db=Database(init())\n #create a new table\n print_time()\n delete_table(table_name)\n create_table(table_name,colsize, ncolumns)\n\n # insert some data to be deleted\n insert_definitions = ', '.join(f'(SELECT substr(md5(random()::text), 1, {colsize}))' for i in range(ncolumns))\n insert_data=f'insert into {table_name} select generate_series(1,{nrows}),{insert_definitions}, now();' \n db.execute_sqls(insert_data) \n\n # delete 80% of the rows\n delete_nrows=int(nrows*0.8)\n vacuum=f'delete from {table_name} where id < {delete_nrows};'\n db.execute_sqls(vacuum)\n\n # do the select , then the vacuum occurs\n select='select * from '+table_name+' where id='\n db.concurrent_execute_sql(threads,duration,select,nrows)\n\n #delete the table\n delete_table(table_name)\n print_time()\n\n\nif __name__ == \"__main__\":\n # Number of threads to use for concurrent inserts\n num_threads = 191\n \n # Duration for which to run the inserts (in seconds)\n insert_duration = None\n \n # Number of columns in the table\n num_columns = 12\n \n # Number of rows to insert\n num_rows = 3771540\n \n # Size of each column (in characters)\n column_size = 65\n \n # Table name\n table_name = 'table1'\n \n # Call the insert_large_data function\n vacuum(num_threads, insert_duration, num_columns, num_rows, column_size, table_name)\n", + "description": "In the inventory database of a retail store, 191 employees simultaneously perform searches using various search criteria such as product name, category, and price range. The searches are executed on a database table containing 12 columns and 3,771,540 rows, with each column having a size of 65 characters. This simulation represents the potential exception that could occur due to a large number of search queries without proper index usage after a database vacuum operation.\n", + "desc": "In the database of an IoT scenario, suppose there is a database named 'IoTDataDB' that collects and analyzes data from various sensors. This database is designed to handle a large volume of sensor data. One of the key tables in this database is called 'SensorReadings', which stores information about sensor readings from a variety of sensors. This table contains 3,771,540 rows of data, with each row representing a reading from a sensor. The table has 12 columns, each capable of storing up to 65 characters of data. These columns may include sensor ID, sensor type, reading value, timestamp, sensor location, and other relevant details. In this scenario, the database administrator needs to perform a VACUUM operation on the table to reclaim unused storage space and improve performance. However, due to the large scale of the table and the high number of concurrent threads (191), the VACUUM operation might cause performance issues, such as increased I/O contention and extended database lock durations. 
These anomalies can impact the overall efficiency and functioning of the IoT data analysis system.\n" + }, + "20": { + "start_time": "1697299733", + "end_time": "1697299848", + "start_timestamp": "2023-10-15 00:08:53", + "end_timestamp": "2023-10-15 00:10:48", + "alerts": [], + "labels": [ + "too many indexes" + ], + "command": "python anomaly_trigger/main.py --anomaly REDUNDANT_INDEX", + "script": "import psycopg2\nimport sys\nsys.path.append('/root/DB-GPT/')\nimport time\nimport datetime\nimport random\nimport yaml\nfrom multiprocessing.pool import *\n\n\nclass DBArgs(object):\n\n def __init__(self, dbtype, config, dbname=None):\n self.dbtype = dbtype\n if self.dbtype == 'mysql':\n self.host = config['host']\n self.port = config['port']\n self.user = config['user']\n self.password = config['password']\n self.dbname = dbname if dbname else config['dbname']\n self.driver = 'com.mysql.jdbc.Driver'\n self.jdbc = 'jdbc:mysql://'\n else:\n self.host = config['host']\n self.port = config['port']\n self.user = config['user']\n self.password = config['password']\n self.dbname = dbname if dbname else config['dbname']\n self.driver = 'org.postgresql.Driver'\n self.jdbc = 'jdbc:postgresql://'\n\nclass Database():\n def __init__(self, args, timeout=-1):\n self.args = args\n self.conn = self.resetConn(timeout)\n\n\n # self.schema = self.compute_table_schema()\n\n def resetConn(self, timeout=-1):\n conn = psycopg2.connect(database=self.args.dbname,\n user=self.args.user,\n password=self.args.password,\n host=self.args.host,\n port=self.args.port)\n return conn\n \n def execute_sqls(self,sql):\n self.conn =self.resetConn(timeout=-1)\n cur = self.conn.cursor()\n cur.execute(sql)\n self.conn.commit()\n cur.close()\n self.conn.close()\n\n def execute_sql_duration(self, duration, sql, max_id=0, commit_interval=500):\n self.conn = self.resetConn(timeout=-1)\n cursor = self.conn.cursor()\n start = time.time()\n cnt = 0\n if duration > 0:\n while (time.time() - start) < duration:\n if max_id > 0:\n id = random.randint(1, max_id - 1)\n cursor.execute(sql + str(id) + ';')\n else:\n cursor.execute(sql)\n cnt += 1\n if cnt % commit_interval == 0:\n self.conn.commit()\n else:\n print(\"error, the duration should be larger than 0\")\n self.conn.commit()\n cursor.close()\n self.conn.close()\n return cnt\n\n def concurrent_execute_sql(self, threads, duration, sql, max_id=0, commit_interval=500):\n pool = ThreadPool(threads)\n results = [pool.apply_async(self.execute_sql_duration, (duration, sql, max_id, commit_interval)) for _ in range(threads)]\n pool.close()\n pool.join()\n return results\n \n def build_index(self, table_name, idx_num):\n self.conn = self.resetConn(timeout=-1)\n cursor = self.conn.cursor()\n \n for i in range(0, idx_num):\n the_sql = 'CREATE INDEX index_' + table_name + '_' + str(i) + ' ON ' + table_name + '(name' + str(i) + ');'\n print(the_sql)\n cursor.execute(the_sql)\n\n \n self.conn.commit()\n self.conn.close()\n return\n\n\n \n def drop_index(self,table_name):\n self.conn = self.resetConn(timeout=-1)\n cursor = self.conn.cursor()\n cursor.execute(\"select indexname from pg_indexes where tablename='\"+table_name+\"';\")\n idxs = cursor.fetchall()\n for idx in idxs:\n the_sql = 'DROP INDEX ' + idx[0] + ';'\n cursor.execute(the_sql)\n print(the_sql)\n self.conn.commit()\n self.conn.close()\n return\n\n\ndef init():\n #add the config\n config_path = \"/root/DB-GPT/config/tool_config.yaml\"\n with open(config_path, 'r') as config_file:\n config = yaml.safe_load(config_file) \n db_args 
=DBArgs('pgsql', config)\n return db_args\n\n\n#create a table\ndef create_table(table_name,colsize, ncolumns):\n db=Database(init())\n column_definitions = ', '.join(f'name{i} varchar({colsize})' for i in range(ncolumns))\n creat_sql = f'CREATE TABLE {table_name} (id int, {column_definitions}, time timestamp);'\n db.execute_sqls(creat_sql)\n\n#delete the table\ndef delete_table(table_name):\n db=Database(init())\n delete_sql=f'DROP TABLE if exists {table_name}'\n db.execute_sqls(delete_sql)\n\n#print the current time\ndef print_time():\n current_time = datetime.datetime.now()\n formatted_time = current_time.strftime(\"%Y-%m-%d %H:%M:%S\")\n print(formatted_time)\n\ndef redundent_index(threads,duration,ncolumns,nrows,colsize,nindex,table_name='table1'):\n #create a new table\n print_time()\n delete_table(table_name)\n create_table(table_name,colsize, ncolumns)\n db=Database(init())\n # insert some data to be updated \n insert_definitions = ', '.join(f'(SELECT substr(md5(random()::text), 1, {colsize}))' for i in range(ncolumns))\n insert_data=f'insert into {table_name} select generate_series(1,{nrows}),{insert_definitions}, now();' \n db.execute_sqls(insert_data) \n\n #initialization of the indexes\n nindex=int((nindex*ncolumns)/10)\n db.build_index(table_name,nindex)\n id_index='CREATE INDEX index_'+table_name+'_id ON '+table_name+'(id);'\n db.execute_sqls(id_index)\n\n #lock_contention\n pool = Pool(threads)\n for _ in range(threads):\n pool.apply_async(\n lock, (table_name, ncolumns, colsize, duration, nrows))\n pool.close()\n pool.join()\n\n #drop the index\n db.drop_index(table_name)\n\n #delete the table\n delete_table(table_name)\n print_time()\n\n\ndef lock(table_name, ncolumns, colsize, duration, nrows):\n args=init()\n start = time.time()\n #lock_contention\n while time.time()-start < duration:\n conn = psycopg2.connect(database=args.dbname, user=args.user, password=args.password,\n host=args.host, port=args.port)\n cur = conn.cursor()\n while time.time()-start < duration:\n col_name = random.randint(0, ncolumns-1)\n row_name = random.randint(1, nrows-1)\n lock_contention = f'update {table_name} set name{col_name}=(SELECT substr(md5(random()::text), 1, {colsize})) where id ={row_name}'\n #db.concurrent_execute_sql(threads,duration,lock_contention,nrows)\n cur.execute(lock_contention)\n conn.commit()\n conn.commit()\n conn.close()\n\nif __name__ == \"__main__\":\n # Number of threads to use for concurrent inserts\n num_threads = 7\n \n # Duration for which to run the inserts (in seconds)\n insert_duration = None\n \n # Number of columns in the table\n num_columns = 59\n \n # Number of rows to insert\n num_rows = 819303\n \n # Size of each column (in characters)\n column_size = 99\n \n # Table name\n table_name = 'table1'\n \n nindex=6\n \n # Call the insert_large_data function\n redundent_index(num_threads, insert_duration, num_columns, num_rows, column_size, nindex,table_name)\n", + "description": "In a database used by an e-commerce platform, 7 users perform a query that involves multiple unnecessary indexes. The database table contains 59 columns and 819,303 rows, with each column containing 99 characters of product information. This simulation aims to demonstrate the additional storage footprint and performance overhead caused by the redundant indexes.\n", + "desc": "In the business intelligence scenario, there is a database named 'BusinessIntelDB' that stores and analyzes financial data for a large corporation. 
Within this database, there is a key table called 'FinancialTransactions' which records various financial transactions and statement information. The table contains 819,303 rows of data, with each row representing a financial record. There are a total of 59 columns in the table, each containing information up to 99 characters. These columns include transaction ID, transaction type (such as income, expenditure, assets, liabilities), amount, date, department, project code, budget code, financial year, audit status, and more.To optimize the performance of complex financial queries, database administrators may create redundant indexes before executing these queries. These indexes could be based on transaction type, date range, department, or project code. However, in some cases, these redundant indexes may lead to inefficiencies and performance issues in the database.Suppose, at a specific moment, 7 users simultaneously perform complex financial queries on the 'FinancialTransactions' table. The database administrator creates multiple redundant indexes before the queries start and then deletes them after the queries are completed. Such frequent creation and deletion of indexes can result in additional storage usage and performance overhead in the database. It can also cause database fragmentation and impact overall performance.In a business intelligence environment, these redundant indexes can lead to delayed report generation, affecting the efficiency of decision-making processes. Additionally, the frequent creation and deletion of indexes can also impact other database operations, such as inserting new financial records or updating existing records.\n" + }, + "21": { + "start_time": "1697299908", + "end_time": "1697299999", + "start_timestamp": "2023-10-15 00:11:48", + "end_timestamp": "2023-10-15 00:13:19", + "alerts": [], + "labels": [ + "INSERT_LARGE_DATA", + "IO_CONTENTION" + ], + "command": "python anomaly_trigger/main.py --anomaly INSERT_LARGE_DATA,IO_CONTENTION", + "script": "import os\nimport datetime\n\n#print the current time\ndef print_time():\n current_time = datetime.datetime.now()\n formatted_time = current_time.strftime(\"%Y-%m-%d %H:%M:%S\")\n print(formatted_time)\n\nif __name__ == \"__main__\":\n print_time()\n command = (\n \"su - root -c 'cd /sysbench-tpcc-master; \"\n \"./tpcc.lua --db-driver=pgsql --tables=2 --scale=3 --threads=50 --events=0 \"\n \"--pgsql-host=xxxx --pgsql-user=xxxx --pgsql-password=xxxx \"\n \"--pgsql-port=5432 --pgsql-db=tpcc --time=90 --rand-type=uniform --report-interval=10 run'\"\n )\n\n os.system(command)\n print_time()\n", + "description": "In a file-sharing system, multiple users are uploading, downloading, or editing files simultaneously. This creates contention for the I/O operations in the system, leading to slower file transfer speeds.\n", + "desc": "In an IoT scenario, there is a database called 'SensorDataDB' that stores data collected from various sensors. This database is designed to handle a large volume of sensor data. One key table in this database is 'SensorReadings', which contains fields to store data from 100 sensors. These fields include sensor ID, reading type, reading value, timestamp, sensor location, and status information. When all 100 sensors start transmitting data simultaneously at a high frequency, the database may face performance issues, such as I/O contention. 
This means that the storage and network bandwidth of the system can be strained due to multiple users simultaneously uploading or downloading large amounts of data. This can lead to slower file transfer speeds and impact the database's ability to handle these concurrent write operations effectively. Additionally, the frequent write operations (such as file uploads and metadata updates) can further impact the performance of the database. This I/O contention can result in delays in file processing and slow down the recording of metadata in the database.\n" + }, + "22": { + "start_time": "1697300059", + "end_time": "1697300120", + "start_timestamp": "2023-10-15 00:14:19", + "end_timestamp": "2023-10-15 00:15:20", + "alerts": [], + "labels": [ + "POOR JOIN PERFORMANCE", + "CPU CONTENTION" + ], + "command": "python anomaly_trigger/main.py --anomaly POOR_JOIN_PERFORMANCE,CPU_CONTENTION", + "script": "import os\nimport re\nimport time\n\nimport psycopg2\n\n\nREPEATCOUNT = 1\nTIMELOGPATH = str(int(time.time())) + \"_job_trigger_time_log.txt\"\nTIMELOG = open(TIMELOGPATH, 'w+')\n\n\nclass Database():\n\n def __init__(self):\n self.conn = None\n self.conn = psycopg2.connect(database='imdbload',\n user='xxxx',\n password='xxxx',\n host='xxxx',\n port=5432)\n\n def execute_sql(self, sql):\n fail = 1\n cur = self.conn.cursor()\n i = 0\n cnt = 3\n while fail == 1 and i < cnt:\n try:\n fail = 0\n cur.execute(sql)\n except BaseException as error:\n fail = 1\n print(error)\n res = []\n if fail == 0:\n res = cur.fetchall()\n i = i + 1\n if fail == 1:\n # print(\"SQL Execution Fatal!!\", sql)\n return 0, ''\n elif fail == 0:\n return 1, res\n\n\ndef all_sql_files():\n res_path = \"{}/join-order-benchmark-master/\".format(\n os.path.dirname(os.path.abspath(__file__)))\n # all_file_list = list(filter(file_filter, os.listdir(res_path)))\n # all_file_list = sorted(all_file_list, key=custom_sort)\n all_file_list = [\n '1a.sql', '1b.sql', '1c.sql', '1d.sql',\n '2a.sql', '2b.sql', '2c.sql', '2d.sql',\n '3a.sql', '3b.sql', '3c.sql',\n '4a.sql', '4b.sql', '4c.sql',\n '5a.sql', '5b.sql', '5c.sql',\n '6a.sql', '6b.sql', '6c.sql', '6d.sql', '6e.sql', '6f.sql',\n '7a.sql', '7b.sql', '7c.sql',\n '8a.sql', '8b.sql', '8c.sql', '8d.sql',\n '9a.sql', '9b.sql', '9c.sql', '9d.sql',\n '10a.sql', '10b.sql', '10c.sql',\n '11a.sql', '11b.sql', '11c.sql', '11d.sql',\n '12a.sql', '12b.sql', '12c.sql',\n '13a.sql', '13b.sql', '13c.sql', '13d.sql',\n '14a.sql', '14b.sql', '14c.sql',\n '15a.sql', '15b.sql', '15c.sql', '15d.sql',\n '16a.sql', '16b.sql', '16c.sql', '16d.sql',\n '17a.sql', '17b.sql', '17c.sql', '17d.sql', '17e.sql', '17f.sql',\n '18a.sql', '18b.sql', '18c.sql',\n '19a.sql', '19b.sql', '19c.sql', '19d.sql',\n '20a.sql', '20b.sql', '20c.sql',\n '21a.sql', '21b.sql', '21c.sql',\n '22a.sql', '22b.sql', '22c.sql', '22d.sql',\n '23a.sql', '23b.sql', '23c.sql',\n '24a.sql', '24b.sql',\n '25a.sql', '25b.sql', '25c.sql',\n '26a.sql', '26b.sql', '26c.sql',\n '27a.sql', '27b.sql', '27c.sql',\n '28a.sql', '28b.sql', '28c.sql',\n '29a.sql', '29b.sql', '29c.sql',\n '30a.sql', '30b.sql', '30c.sql',\n '31a.sql', '31b.sql', '31c.sql',\n '32a.sql', '32b.sql',\n '33a.sql', '33b.sql', '33c.sql']\n\n print(all_file_list)\n files_list = []\n for file in all_file_list:\n files_list.append(res_path + file)\n return files_list\n\n\ndef custom_sort(item):\n # \u63d0\u53d6\u6570\u5b57\u548c\u5b57\u6bcd\u90e8\u5206\n match = re.match(r'(\\d+)(\\D+)', item)\n # 
\u5c06\u6570\u5b57\u90e8\u5206\u8f6c\u6362\u4e3a\u6574\u6570\u4ee5\u8fdb\u884c\u6bd4\u8f83\n num_part = int(match.group(1))\n # \u8fd4\u56de\u5143\u7ec4\u4ee5\u6309\u6570\u5b57\u548c\u5b57\u6bcd\u6392\u5e8f\n return (num_part, match.group(2))\n\n\ndef file_filter(f):\n if f[-4:] == '.sql' and 'schema' not in f and 'fkindexes' not in f:\n return True\n else:\n return False\n\n\ndef get_sql_from_file(file_name):\n file = open(file_name)\n lines = file.readlines().copy()\n sql = ''\n for line in lines:\n sql += line\n sql = sql.replace('\n', ' ').replace(' ', ' ').replace(' ', ' ')\n file.close()\n return sql\n\n\ndef test_hint_from_file(sql_file):\n db = Database()\n sql = get_sql_from_file(sql_file)\n success, result_cont = db.execute_sql(sql)\n print(success, result_cont)\n\n\ndef test_all():\n sql_files = all_sql_files()\n\n for sql_file in list(sql_files)[:-10]:\n if sql_file:\n test_hint_from_file(sql_file)\n\n\ndef test_one():\n res_path = \"{}/join-order-benchmark-master/\".format(\n os.path.dirname(os.path.abspath(__file__)))\n test_hint_from_file(res_path + '1a.sql')\n\n\nif __name__ == '__main__':\n for i in range(0, REPEATCOUNT):\n TIMELOG.write(str(int(time.time()))+\";\")\n test_all()\n TIMELOG.write(str(int(time.time()))+\"\n\")\n TIMELOG.flush()\n\n TIMELOG.close()\n\n)\n", + "description": "In a large-scale data analytics system, multiple joins are performed between tables with poor join performance. Additionally, there is contention for CPU resources during the join operation, resulting in degraded performance of the system.\n", + "desc": "In an Internet of Things (IoT) scenario, suppose there is a database named 'SmartHomeDB' that stores data about various smart home devices. This database contains a key table named 'DeviceStatus', which records the status and performance information of these devices. The table consists of multiple rows of data, each representing a device, with various columns containing information such as device ID, device type, connectivity status, power usage, temperature, humidity, activity logs, and more. When performing complex queries that involve joining multiple tables, such as retrieving all devices with high power usage and low humidity, the database might encounter poor join performance. This occurs when the SQL query execution plan does not efficiently utilize indexes or when there is a lack of appropriate join conditions. These issues could lead to slower query execution times, increased CPU usage, and potential contention for CPU resources. 
Consequently, the performance of the entire system might be affected, as the database struggles to handle concurrent join-based queries.\n" + }, + "23": { + "start_time": "1697300180", + "end_time": "1697300330", + "start_timestamp": "2023-10-15 00:16:20", + "end_timestamp": "2023-10-15 00:18:50", + "alerts": [], + "labels": [ + "FETCH_LARGE_DATA", + "CORRELATED SUBQUERY" + ], + "command": "python anomaly_trigger/main.py --anomaly FETCH_LARGE_DATA,CORRELATED_SUBQUERY", + "script": "import os\nimport re\nimport time\n\nimport psycopg2\n\n\nREPEATCOUNT = 1\nTIMELOGPATH = str(int(time.time())) + \"_tpch_trigger_time_log.txt\"\nTIMELOG = open(TIMELOGPATH, 'w+')\n\n\nclass Database():\n\n def __init__(self):\n self.conn = None\n self.conn = psycopg2.connect(database='tpch',\n user='xxxx',\n password='xxxx',\n host='xxxx',\n port=5432)\n\n def execute_sql(self, sql):\n fail = 1\n cur = self.conn.cursor()\n i = 0\n cnt = 3\n while fail == 1 and i < cnt:\n try:\n fail = 0\n cur.execute(sql)\n except BaseException as error:\n fail = 1\n print(error)\n res = []\n if fail == 0:\n res = cur.fetchall()\n i = i + 1\n if fail == 1:\n # print(\"SQL Execution Fatal!!\", sql)\n return 0, ''\n elif fail == 0:\n return 1, res\n\n\ndef all_sql_files():\n res_path = \"{}/tpch-queries/\".format(\n os.path.dirname(os.path.abspath(__file__)))\n # all_file_list = list(filter(file_filter, os.listdir(res_path)))\n # all_file_list = sorted(all_file_list, key=custom_sort)\n all_file_list = [\n '4.explain.sql']\n\n print(all_file_list)\n files_list = []\n for file in all_file_list:\n files_list.append(res_path + file)\n return files_list\n\n\ndef custom_sort(item):\n # \u63d0\u53d6\u6570\u5b57\u548c\u5b57\u6bcd\u90e8\u5206\n match = re.match(r'(\\d+)(\\D+)', item)\n # \u5c06\u6570\u5b57\u90e8\u5206\u8f6c\u6362\u4e3a\u6574\u6570\u4ee5\u8fdb\u884c\u6bd4\u8f83\n num_part = int(match.group(1))\n # \u8fd4\u56de\u5143\u7ec4\u4ee5\u6309\u6570\u5b57\u548c\u5b57\u6bcd\u6392\u5e8f\n return (num_part, match.group(2))\n\n\ndef file_filter(f):\n if f[-4:] == '.sql' and 'schema' not in f and 'fkindexes' not in f:\n return True\n else:\n return False\n\n\ndef get_sql_from_file(file_name):\n file = open(file_name)\n lines = file.readlines().copy()\n sql = ''\n for line in lines:\n sql += line\n sql = sql.replace('\n', ' ').replace(' ', ' ').replace(' ', ' ')\n file.close()\n return sql\n\n\ndef test_hint_from_file(sql_file):\n db = Database()\n sql = get_sql_from_file(sql_file)\n success, result_cont = db.execute_sql(sql)\n print(success, result_cont)\n\n\ndef test_all():\n sql_files = all_sql_files()\n\n for sql_file in list(sql_files):\n if sql_file:\n test_hint_from_file(sql_file)\n\n\ndef test_one():\n res_path = \"{}/tpch-queries/\".format(\n os.path.dirname(os.path.abspath(__file__)))\n test_hint_from_file(res_path + '1.explain.sql')\n\n\nif __name__ == '__main__':\n for i in range(0, REPEATCOUNT):\n TIMELOG.write(str(int(time.time()))+\";\")\n test_all()\n TIMELOG.write(str(int(time.time()))+\"\n\")\n TIMELOG.flush()\n\n TIMELOG.close()\n", + "description": "In an e-commerce platform's database, when trying to fetch a large amount of data, particularly when using correlated subqueries, there is a potential for the performance of the query to degrade. 
This means that the system may take longer to provide the requested data if the subqueries are not optimized properly.\n", + "desc": "In an Internet of Things (IoT) scenario, imagine a system that collects and analyzes data from various sensors, such as temperature, humidity, or motion sensors, for monitoring and controlling greenhouse conditions for plant growth. This system includes a database called 'GreenhouseDataDB' that stores sensor data. One of the key tables in this database is 'SensorReadings', which records information about different types of sensor readings in the greenhouse. This table contains a large number of rows, each representing a sensor reading, and multiple columns to store sensor data, such as sensor ID, type, reading value, timestamp, and location. In this scenario, fetching large amounts of sensor data and performing related subqueries is a common operation. For example, the system might need to retrieve all temperature readings from a specific sensor within a certain date range and calculate the average temperature. However, due to the large volume of data and the need to perform multiple subqueries for each sensor reading, the database might encounter challenges in efficiently processing these queries. This can result in slow query execution times and potentially impact the real-time monitoring and control of the greenhouse environment.\n" + }, + "24": { + "start_time": "1697300390", + "end_time": "1697300461", + "start_timestamp": "2023-10-15 00:19:50", + "end_timestamp": "2023-10-15 00:21:01", + "alerts": [], + "labels": [ + "highly concurrent commits or highly concurrent inserts" + ], + "command": "python anomaly_trigger/main.py --anomaly INSERT_LARGE_DATA", + "script": "import psycopg2\nimport sys\nsys.path.append('/root/DB-GPT/')\nimport time\nimport datetime\nimport random\nimport yaml\nfrom multiprocessing.pool import *\n\nclass DBArgs(object):\n\n def __init__(self, dbtype, config, dbname=None):\n self.dbtype = dbtype\n if self.dbtype == 'mysql':\n self.host = config['host']\n self.port = config['port']\n self.user = config['user']\n self.password = config['password']\n self.dbname = dbname if dbname else config['dbname']\n self.driver = 'com.mysql.jdbc.Driver'\n self.jdbc = 'jdbc:mysql://'\n else:\n self.host = config['host']\n self.port = config['port']\n self.user = config['user']\n self.password = config['password']\n self.dbname = dbname if dbname else config['dbname']\n self.driver = 'org.postgresql.Driver'\n self.jdbc = 'jdbc:postgresql://'\n\nclass Database():\n def __init__(self, args, timeout=-1):\n self.args = args\n self.conn = self.resetConn(timeout)\n\n\n # self.schema = self.compute_table_schema()\n\n def resetConn(self, timeout=-1):\n conn = psycopg2.connect(database=self.args.dbname,\n user=self.args.user,\n password=self.args.password,\n host=self.args.host,\n port=self.args.port)\n return conn\n\n def execute_sqls(self,sql):\n self.conn =self.resetConn(timeout=-1)\n cur = self.conn.cursor()\n cur.execute(sql)\n self.conn.commit()\n cur.close()\n self.conn.close()\n\n def execute_sql_duration(self, duration, sql, max_id=0, commit_interval=500):\n self.conn = self.resetConn(timeout=-1)\n cursor = self.conn.cursor()\n start = time.time()\n cnt = 0\n if duration > 0:\n while (time.time() - start) < duration:\n if max_id > 0:\n id = random.randint(1, max_id - 1)\n cursor.execute(sql + str(id) + ';')\n else:\n cursor.execute(sql)\n cnt += 1\n if cnt % commit_interval == 0:\n self.conn.commit()\n else:\n print(\"error, the duration should be 
larger than 0\")\n self.conn.commit()\n cursor.close()\n self.conn.close()\n return cnt\n\n def concurrent_execute_sql(self, threads, duration, sql, max_id=0, commit_interval=500):\n pool = ThreadPool(threads)\n results = [pool.apply_async(self.execute_sql_duration, (duration, sql, max_id, commit_interval)) for _ in range(threads)]\n pool.close()\n pool.join()\n return results\n \ndef init():\n #add the config\n config_path = \"/root/DB-GPT/config/tool_config.yaml\"\n with open(config_path, 'r') as config_file:\n config = yaml.safe_load(config_file) \n db_args =DBArgs('pgsql', config)\n return db_args\n\n#create a table\ndef create_table(table_name,colsize, ncolumns):\n db=Database(init())\n column_definitions = ', '.join(f'name{i} varchar({colsize})' for i in range(ncolumns))\n creat_sql = f'CREATE TABLE {table_name} (id int, {column_definitions}, time timestamp);'\n db.execute_sqls(creat_sql)\n\n#delete the table\ndef delete_table(table_name):\n db=Database(init())\n delete_sql=f'DROP TABLE if exists {table_name}'\n db.execute_sqls(delete_sql)\n\n#print the current time\ndef print_time():\n current_time = datetime.datetime.now()\n formatted_time = current_time.strftime(\"%Y-%m-%d %H:%M:%S\")\n print(formatted_time)\n\n\ndef insert_large_data(threads,duration,ncolumns,nrows,colsize,table_name='table1'):\n\n print_time()\n #Delete undeleted tables\n delete_table(table_name)\n #create a new table\n create_table(table_name,colsize, ncolumns)\n db=Database(init())\n #insert the data\n #insert_definitions = ', '.join(f'repeat(round(random()*999)::text,{(colsize//3)})' for i in range(ncolumns))\n insert_definitions = ', '.join(f'(SELECT substr(md5(random()::text), 1, {colsize}))' for i in range(ncolumns))\n insert_data=f'insert into {table_name} select generate_series(1,{nrows}),{insert_definitions}, now();'\n db.concurrent_execute_sql(threads,duration,insert_data,commit_interval=1)\n\n #delete the table\n delete_table(table_name)\n \n #print the end time\n print_time()\nif __name__ == \"__main__\":\n # Number of threads to use for concurrent inserts\n num_threads = 109\n \n # Duration for which to run the inserts (in seconds)\n insert_duration = None\n \n # Number of columns in the table\n num_columns = 16\n \n # Number of rows to insert\n num_rows = 66\n \n # Size of each column (in characters)\n column_size = 25\n \n # Table name\n table_name = 'table1'\n \n # Call the insert_large_data function\n insert_large_data(num_threads, insert_duration, num_columns, num_rows, column_size, table_name)\n", + "description": "In a data management system where multiple users are simultaneously inserting a large amount of data, 109 threads are used to simulate the insertion process. The data being inserted consists of 16 columns with each column containing a maximum of 25 characters. There are a total of 66 rows being inserted. This scenario helps identify any potential exceptions or issues that may arise during this process.\n", + "desc": "In the context of an Internet of Things (IoT) system, there is a database named 'SensorDataDB' that is used to collect and analyze sensor data. The database stores data from various types of sensors and has a primary table called 'SensorReadings'. This table contains 66 rows of data, with each row representing a sensor reading, and consists of 16 columns, each column having a size of up to 25 characters. These columns include sensor ID, reading type, reading value, timestamp, sensor location, and status information. 
In this scenario, there is a need to insert a large amount of data into the 'SensorReadings' table. To simulate this, the script is triggered with the 'INSERT_LARGE_DATA' anomaly, using 109 threads to perform concurrent insert operations. This can cause performance issues in the database, such as increased write latency and potential database locking, especially if the database is not properly optimized to handle such a high number of concurrent write requests.\n" + }, + "25": { + "start_time": "1697300521", + "end_time": "1697300593", + "start_timestamp": "2023-10-15 00:22:01", + "end_timestamp": "2023-10-15 00:23:13", + "alerts": [], + "labels": [ + "highly concurrent commits or highly concurrent inserts" + ], + "command": "python anomaly_trigger/main.py --anomaly INSERT_LARGE_DATA", + "script": "import psycopg2\nimport sys\nsys.path.append('/root/DB-GPT/')\nimport time\nimport datetime\nimport random\nimport yaml\nfrom multiprocessing.pool import *\n\nclass DBArgs(object):\n\n def __init__(self, dbtype, config, dbname=None):\n self.dbtype = dbtype\n if self.dbtype == 'mysql':\n self.host = config['host']\n self.port = config['port']\n self.user = config['user']\n self.password = config['password']\n self.dbname = dbname if dbname else config['dbname']\n self.driver = 'com.mysql.jdbc.Driver'\n self.jdbc = 'jdbc:mysql://'\n else:\n self.host = config['host']\n self.port = config['port']\n self.user = config['user']\n self.password = config['password']\n self.dbname = dbname if dbname else config['dbname']\n self.driver = 'org.postgresql.Driver'\n self.jdbc = 'jdbc:postgresql://'\n\nclass Database():\n def __init__(self, args, timeout=-1):\n self.args = args\n self.conn = self.resetConn(timeout)\n\n\n # self.schema = self.compute_table_schema()\n\n def resetConn(self, timeout=-1):\n conn = psycopg2.connect(database=self.args.dbname,\n user=self.args.user,\n password=self.args.password,\n host=self.args.host,\n port=self.args.port)\n return conn\n\n def execute_sqls(self,sql):\n self.conn =self.resetConn(timeout=-1)\n cur = self.conn.cursor()\n cur.execute(sql)\n self.conn.commit()\n cur.close()\n self.conn.close()\n\n def execute_sql_duration(self, duration, sql, max_id=0, commit_interval=500):\n self.conn = self.resetConn(timeout=-1)\n cursor = self.conn.cursor()\n start = time.time()\n cnt = 0\n if duration > 0:\n while (time.time() - start) < duration:\n if max_id > 0:\n id = random.randint(1, max_id - 1)\n cursor.execute(sql + str(id) + ';')\n else:\n cursor.execute(sql)\n cnt += 1\n if cnt % commit_interval == 0:\n self.conn.commit()\n else:\n print(\"error, the duration should be larger than 0\")\n self.conn.commit()\n cursor.close()\n self.conn.close()\n return cnt\n\n def concurrent_execute_sql(self, threads, duration, sql, max_id=0, commit_interval=500):\n pool = ThreadPool(threads)\n results = [pool.apply_async(self.execute_sql_duration, (duration, sql, max_id, commit_interval)) for _ in range(threads)]\n pool.close()\n pool.join()\n return results\n \ndef init():\n #add the config\n config_path = \"/root/DB-GPT/config/tool_config.yaml\"\n with open(config_path, 'r') as config_file:\n config = yaml.safe_load(config_file) \n db_args =DBArgs('pgsql', config)\n return db_args\n\n#create a table\ndef create_table(table_name,colsize, ncolumns):\n db=Database(init())\n column_definitions = ', '.join(f'name{i} varchar({colsize})' for i in range(ncolumns))\n creat_sql = f'CREATE TABLE {table_name} (id int, {column_definitions}, time timestamp);'\n db.execute_sqls(creat_sql)\n\n#delete 
the table\ndef delete_table(table_name):\n db=Database(init())\n delete_sql=f'DROP TABLE if exists {table_name}'\n db.execute_sqls(delete_sql)\n\n#print the current time\ndef print_time():\n current_time = datetime.datetime.now()\n formatted_time = current_time.strftime(\"%Y-%m-%d %H:%M:%S\")\n print(formatted_time)\n\n\ndef insert_large_data(threads,duration,ncolumns,nrows,colsize,table_name='table1'):\n\n print_time()\n #Delete undeleted tables\n delete_table(table_name)\n #create a new table\n create_table(table_name,colsize, ncolumns)\n db=Database(init())\n #insert the data\n #insert_definitions = ', '.join(f'repeat(round(random()*999)::text,{(colsize//3)})' for i in range(ncolumns))\n insert_definitions = ', '.join(f'(SELECT substr(md5(random()::text), 1, {colsize}))' for i in range(ncolumns))\n insert_data=f'insert into {table_name} select generate_series(1,{nrows}),{insert_definitions}, now();'\n db.concurrent_execute_sql(threads,duration,insert_data,commit_interval=1)\n\n #delete the table\n delete_table(table_name)\n \n #print the end time\n print_time()\nif __name__ == \"__main__\":\n # Number of threads to use for concurrent inserts\n num_threads = 109\n \n # Duration for which to run the inserts (in seconds)\n insert_duration = None\n \n # Number of columns in the table\n num_columns = 32\n \n # Number of rows to insert\n num_rows = 94\n \n # Size of each column (in characters)\n column_size = 83\n \n # Table name\n table_name = 'table1'\n \n # Call the insert_large_data function\n insert_large_data(num_threads, insert_duration, num_columns, num_rows, column_size, table_name)\n", + "description": "In a smart home system, a large amount of data generated by 109 sensors needs to be inserted into the database simultaneously. Each sensor generates data with 32 columns of information, each column containing 83 characters. The database table contains 94 rows to store this data. Running this script simulates the database exception caused by the insertion process.\n", + "desc": "In an Internet of Things (IoT) scenario, suppose there is a database called 'IoTDataDB' that is responsible for collecting and storing data from various sensors. This database is designed to handle a large volume of data and perform real-time analysis. In this scenario, there is a table named 'SensorData' where data from 94 sensors is stored. Each row in the table represents a reading from a specific sensor, and there are 32 columns to store various information such as sensor ID, sensor type, reading value, timestamp, location, and status. The size of each column is limited to 83 characters. When all 109 sensors start transmitting data simultaneously, the database might face performance challenges due to the high volume of incoming data. The lack of proper data partitioning or indexing strategies can cause slow write operations and increased latency in storing the sensor readings. This could result in delayed processing or even loss of incoming sensor data, leading to anomalies in the database. 
These anomalies can affect the accuracy and efficiency of real-time data analysis, impacting the overall functionality of the IoT system.\n" + }, + "26": { + "start_time": "1697300653", + "end_time": "1697300713", + "start_timestamp": "2023-10-15 00:24:13", + "end_timestamp": "2023-10-15 00:25:13", + "alerts": [], + "labels": [ + "highly concurrent updates" + ], + "command": "python anomaly_trigger/main.py --anomaly LOCK_CONTENTION", + "script": "import psycopg2\nimport sys\nsys.path.append('/root/DB-GPT/')\nimport time\nimport datetime\nimport random\nimport yaml\nfrom multiprocessing.pool import *\n\n\nclass DBArgs(object):\n\n def __init__(self, dbtype, config, dbname=None):\n self.dbtype = dbtype\n if self.dbtype == 'mysql':\n self.host = config['host']\n self.port = config['port']\n self.user = config['user']\n self.password = config['password']\n self.dbname = dbname if dbname else config['dbname']\n self.driver = 'com.mysql.jdbc.Driver'\n self.jdbc = 'jdbc:mysql://'\n else:\n self.host = config['host']\n self.port = config['port']\n self.user = config['user']\n self.password = config['password']\n self.dbname = dbname if dbname else config['dbname']\n self.driver = 'org.postgresql.Driver'\n self.jdbc = 'jdbc:postgresql://'\n\nclass Database():\n def __init__(self, args, timeout=-1):\n self.args = args\n self.conn = self.resetConn(timeout)\n\n\n # self.schema = self.compute_table_schema()\n\n def resetConn(self, timeout=-1):\n conn = psycopg2.connect(database=self.args.dbname,\n user=self.args.user,\n password=self.args.password,\n host=self.args.host,\n port=self.args.port)\n return conn\n \n def execute_sqls(self,sql):\n self.conn =self.resetConn(timeout=-1)\n cur = self.conn.cursor()\n cur.execute(sql)\n self.conn.commit()\n cur.close()\n self.conn.close()\n\n def execute_sql_duration(self, duration, sql, max_id=0, commit_interval=500):\n self.conn = self.resetConn(timeout=-1)\n cursor = self.conn.cursor()\n start = time.time()\n cnt = 0\n if duration > 0:\n while (time.time() - start) < duration:\n if max_id > 0:\n id = random.randint(1, max_id - 1)\n cursor.execute(sql + str(id) + ';')\n else:\n cursor.execute(sql)\n cnt += 1\n if cnt % commit_interval == 0:\n self.conn.commit()\n else:\n print(\"error, the duration should be larger than 0\")\n self.conn.commit()\n cursor.close()\n self.conn.close()\n return cnt\n\n def concurrent_execute_sql(self, threads, duration, sql, max_id=0, commit_interval=500):\n pool = ThreadPool(threads)\n results = [pool.apply_async(self.execute_sql_duration, (duration, sql, max_id, commit_interval)) for _ in range(threads)]\n pool.close()\n pool.join()\n return results\n\ndef init():\n #add the config\n config_path = \"/root/DB-GPT/config/tool_config.yaml\"\n with open(config_path, 'r') as config_file:\n config = yaml.safe_load(config_file) \n db_args =DBArgs('pgsql', config)\n return db_args\n\n\n#create a table\ndef create_table(table_name,colsize, ncolumns):\n db=Database(init())\n column_definitions = ', '.join(f'name{i} varchar({colsize})' for i in range(ncolumns))\n creat_sql = f'CREATE TABLE {table_name} (id int, {column_definitions}, time timestamp);'\n db.execute_sqls(creat_sql)\n\n#delete the table\ndef delete_table(table_name):\n db=Database(init())\n delete_sql=f'DROP TABLE if exists {table_name}'\n db.execute_sqls(delete_sql)\n\n#print the current time\ndef print_time():\n current_time = datetime.datetime.now()\n formatted_time = current_time.strftime(\"%Y-%m-%d %H:%M:%S\")\n print(formatted_time)\n\n\ndef 
lock_contention(threads,duration,ncolumns,nrows,colsize,table_name='table1'):\n #create a new table\n print_time()\n delete_table(table_name)\n create_table(table_name,colsize, ncolumns)\n db=Database(init())\n # insert some data to be updated \n insert_definitions = ', '.join(f'(SELECT substr(md5(random()::text), 1, {colsize}))' for i in range(ncolumns))\n insert_data=f'insert into {table_name} select generate_series(1,{nrows}),{insert_definitions}, now();' \n db.execute_sqls(insert_data) \n pool = Pool(threads)\n for _ in range(threads):\n pool.apply_async(\n lock, (table_name, ncolumns, colsize, duration, nrows))\n pool.close()\n pool.join()\n #delete the table\n delete_table(table_name)\n print_time()\n\ndef lock(table_name, ncolumns, colsize, duration, nrows):\n args=init()\n start = time.time()\n #lock_contention\n while time.time()-start < duration:\n conn = psycopg2.connect(database=args.dbname, user=args.user, password=args.password,\n host=args.host, port=args.port)\n cur = conn.cursor()\n while time.time()-start < duration:\n col_name = random.randint(0, ncolumns-1)\n row_name = random.randint(1, nrows-1)\n lock_contention = f'update {table_name} set name{col_name}=(SELECT substr(md5(random()::text), 1, {colsize})) where id ={row_name}'\n #db.concurrent_execute_sql(threads,duration,lock_contention,nrows)\n cur.execute(lock_contention)\n conn.commit()\n conn.commit()\n conn.close()\n\nif __name__ == \"__main__\":\n # Number of threads to use for concurrent inserts\n num_threads = 176\n \n # Duration for which to run the inserts (in seconds)\n insert_duration = None\n \n # Number of columns in the table\n num_columns = 63\n \n # Number of rows to insert\n num_rows = 307\n \n # Size of each column (in characters)\n column_size = 96\n \n # Table name\n table_name = 'table1'\n \n # Call the insert_large_data function\n lock_contention(num_threads, insert_duration, num_columns, num_rows, column_size, table_name)\n", + "description": "In a database used by an online marketplace, 176 users are trying to perform simultaneous update operations on a product table that contains 63 columns and 307 rows. Each column has a size of 96 characters. Due to the high number of concurrent requests and the resulting lock contention, there might be a delay or exception in the database.\n", + "desc": "In the context of the Internet of Things (IoT), there is a database used for collecting and processing sensor data from various devices. This database, called 'SensorDataDB', contains a key table named 'SensorReadings' that records detailed information about sensor readings. Within this table, there are 307 rows of data, each representing a specific sensor reading. The table consists of 63 columns, each containing information up to 96 characters long. These columns may include the sensor ID, reading type, reading value, timestamp, sensor location, and status information, among others.In a typical scenario, multiple IoT devices continuously transmit data to the 'SensorReadings' table, causing high concurrency in write operations. Specifically, in this particular instance, 176 devices simultaneously attempt write operations, which can lead to contention issues due to the database's locking mechanism. As a result, there may be delays in processing other users' requests or possible failures. 
This contention can also have an impact on the performance and operations of the entire IoT system.It is worth noting that the above information is a randomly generated scenario, and specific details may vary in real-life situations.\n" + }, + "27": { + "start_time": "1697300773", + "end_time": "1697300847", + "start_timestamp": "2023-10-15 00:26:13", + "end_timestamp": "2023-10-15 00:27:27", + "alerts": [], + "labels": [ + "highly deletes" + ], + "command": "python anomaly_trigger/main.py --anomaly VACUUM", + "script": "import psycopg2\nimport sys\nsys.path.append('/root/DB-GPT/')\nimport time\nimport datetime\nimport random\nimport yaml\nfrom multiprocessing.pool import *\n\n\nclass DBArgs(object):\n\n def __init__(self, dbtype, config, dbname=None):\n self.dbtype = dbtype\n if self.dbtype == 'mysql':\n self.host = config['host']\n self.port = config['port']\n self.user = config['user']\n self.password = config['password']\n self.dbname = dbname if dbname else config['dbname']\n self.driver = 'com.mysql.jdbc.Driver'\n self.jdbc = 'jdbc:mysql://'\n else:\n self.host = config['host']\n self.port = config['port']\n self.user = config['user']\n self.password = config['password']\n self.dbname = dbname if dbname else config['dbname']\n self.driver = 'org.postgresql.Driver'\n self.jdbc = 'jdbc:postgresql://'\n\nclass Database():\n def __init__(self, args, timeout=-1):\n self.args = args\n self.conn = self.resetConn(timeout)\n\n\n # self.schema = self.compute_table_schema()\n\n def resetConn(self, timeout=-1):\n conn = psycopg2.connect(database=self.args.dbname,\n user=self.args.user,\n password=self.args.password,\n host=self.args.host,\n port=self.args.port)\n return conn\n \n def execute_sqls(self,sql):\n self.conn =self.resetConn(timeout=-1)\n cur = self.conn.cursor()\n cur.execute(sql)\n self.conn.commit()\n cur.close()\n self.conn.close()\n\n def execute_sql_duration(self, duration, sql, max_id=0, commit_interval=500):\n self.conn = self.resetConn(timeout=-1)\n cursor = self.conn.cursor()\n start = time.time()\n cnt = 0\n if duration > 0:\n while (time.time() - start) < duration:\n if max_id > 0:\n id = random.randint(1, max_id - 1)\n cursor.execute(sql + str(id) + ';')\n else:\n cursor.execute(sql)\n cnt += 1\n if cnt % commit_interval == 0:\n self.conn.commit()\n else:\n print(\"error, the duration should be larger than 0\")\n self.conn.commit()\n cursor.close()\n self.conn.close()\n return cnt\n\n def concurrent_execute_sql(self, threads, duration, sql, max_id=0, commit_interval=500):\n pool = ThreadPool(threads)\n results = [pool.apply_async(self.execute_sql_duration, (duration, sql, max_id, commit_interval)) for _ in range(threads)]\n pool.close()\n pool.join()\n return results\n \ndef init():\n #add the config\n config_path = \"/root/DB-GPT/config/tool_config.yaml\"\n with open(config_path, 'r') as config_file:\n config = yaml.safe_load(config_file) \n db_args =DBArgs('pgsql', config)\n return db_args\n\n\n#create a table\ndef create_table(table_name,colsize, ncolumns):\n db=Database(init())\n column_definitions = ', '.join(f'name{i} varchar({colsize})' for i in range(ncolumns))\n creat_sql = f'CREATE TABLE {table_name} (id int, {column_definitions}, time timestamp);'\n db.execute_sqls(creat_sql)\n\n#delete the table\ndef delete_table(table_name):\n db=Database(init())\n delete_sql=f'DROP TABLE if exists {table_name}'\n db.execute_sqls(delete_sql)\n\n#print the current time\ndef print_time():\n current_time = datetime.datetime.now()\n formatted_time = 
current_time.strftime(\"%Y-%m-%d %H:%M:%S\")\n print(formatted_time)\n\n\ndef vacuum(threads,duration,ncolumns,nrows,colsize,table_name='table1'):\n db=Database(init())\n #create a new table\n print_time()\n delete_table(table_name)\n create_table(table_name,colsize, ncolumns)\n\n # insert some data to be deleted\n insert_definitions = ', '.join(f'(SELECT substr(md5(random()::text), 1, {colsize}))' for i in range(ncolumns))\n insert_data=f'insert into {table_name} select generate_series(1,{nrows}),{insert_definitions}, now();' \n db.execute_sqls(insert_data) \n\n # delete 80% of the rows\n delete_nrows=int(nrows*0.8)\n vacuum=f'delete from {table_name} where id < {delete_nrows};'\n db.execute_sqls(vacuum)\n\n # do the select , then the vacuum occurs\n select='select * from '+table_name+' where id='\n db.concurrent_execute_sql(threads,duration,select,nrows)\n\n #delete the table\n delete_table(table_name)\n print_time()\n\n\nif __name__ == \"__main__\":\n # Number of threads to use for concurrent inserts\n num_threads = 136\n \n # Duration for which to run the inserts (in seconds)\n insert_duration = None\n \n # Number of columns in the table\n num_columns = 16\n \n # Number of rows to insert\n num_rows = 2211862\n \n # Size of each column (in characters)\n column_size = 83\n \n # Table name\n table_name = 'table1'\n \n # Call the insert_large_data function\n vacuum(num_threads, insert_duration, num_columns, num_rows, column_size, table_name)\n", + "description": "In an online store's database with a table containing 16 columns and 2,211,862 rows of product records, each with a column size of 83 characters, 136 users simultaneously perform a search after a large-scale data cleaning operation. This simulates the scenario of users searching for products using various filters like product name, category, and price range, after performing a vacuum operation on the database table.\n", + "desc": "In the context of an IoT (Internet of Things) scenario, let's imagine a database called 'SensorDataDB'. This database is used for storing and analyzing sensor data from various IoT devices. It is designed to handle a large volume of data coming from these sensors. One of the main tables in this database is called 'SensorReadings', which contains information about the readings obtained from different sensors. This table consists of 2,211,862 rows, with each row representing a specific sensor reading. There are a total of 16 columns in this table, each column capable of storing up to 83 characters. These columns may include sensor ID, reading type, reading value, timestamp, sensor location, and other relevant information. In this specific scenario, the database administrator wants to optimize the performance and efficiency of the database by performing a VACUUM operation. This operation involves reclaiming unused space in the database and optimizing its storage structure. 
To execute this VACUUM operation, a script is being run with the given parameters, including the anomaly flag 'VACUUM', 136 threads (which means 136 concurrent connections or processes), the number of columns set to 16, each column size set to 83 characters, and the total row count set to 2,211,862.\n" + }, + "28": { + "start_time": "1697300907", + "end_time": "1697301022", + "start_timestamp": "2023-10-15 00:28:27", + "end_timestamp": "2023-10-15 00:30:22", + "alerts": [], + "labels": [ + "too many indexes" + ], + "command": "python anomaly_trigger/main.py --anomaly REDUNDANT_INDEX", + "script": "import psycopg2\nimport sys\nsys.path.append('/root/DB-GPT/')\nimport time\nimport datetime\nimport random\nimport yaml\nfrom multiprocessing.pool import *\n\n\nclass DBArgs(object):\n\n def __init__(self, dbtype, config, dbname=None):\n self.dbtype = dbtype\n if self.dbtype == 'mysql':\n self.host = config['host']\n self.port = config['port']\n self.user = config['user']\n self.password = config['password']\n self.dbname = dbname if dbname else config['dbname']\n self.driver = 'com.mysql.jdbc.Driver'\n self.jdbc = 'jdbc:mysql://'\n else:\n self.host = config['host']\n self.port = config['port']\n self.user = config['user']\n self.password = config['password']\n self.dbname = dbname if dbname else config['dbname']\n self.driver = 'org.postgresql.Driver'\n self.jdbc = 'jdbc:postgresql://'\n\nclass Database():\n def __init__(self, args, timeout=-1):\n self.args = args\n self.conn = self.resetConn(timeout)\n\n\n # self.schema = self.compute_table_schema()\n\n def resetConn(self, timeout=-1):\n conn = psycopg2.connect(database=self.args.dbname,\n user=self.args.user,\n password=self.args.password,\n host=self.args.host,\n port=self.args.port)\n return conn\n \n def execute_sqls(self,sql):\n self.conn =self.resetConn(timeout=-1)\n cur = self.conn.cursor()\n cur.execute(sql)\n self.conn.commit()\n cur.close()\n self.conn.close()\n\n def execute_sql_duration(self, duration, sql, max_id=0, commit_interval=500):\n self.conn = self.resetConn(timeout=-1)\n cursor = self.conn.cursor()\n start = time.time()\n cnt = 0\n if duration > 0:\n while (time.time() - start) < duration:\n if max_id > 0:\n id = random.randint(1, max_id - 1)\n cursor.execute(sql + str(id) + ';')\n else:\n cursor.execute(sql)\n cnt += 1\n if cnt % commit_interval == 0:\n self.conn.commit()\n else:\n print(\"error, the duration should be larger than 0\")\n self.conn.commit()\n cursor.close()\n self.conn.close()\n return cnt\n\n def concurrent_execute_sql(self, threads, duration, sql, max_id=0, commit_interval=500):\n pool = ThreadPool(threads)\n results = [pool.apply_async(self.execute_sql_duration, (duration, sql, max_id, commit_interval)) for _ in range(threads)]\n pool.close()\n pool.join()\n return results\n \n def build_index(self, table_name, idx_num):\n self.conn = self.resetConn(timeout=-1)\n cursor = self.conn.cursor()\n \n for i in range(0, idx_num):\n the_sql = 'CREATE INDEX index_' + table_name + '_' + str(i) + ' ON ' + table_name + '(name' + str(i) + ');'\n print(the_sql)\n cursor.execute(the_sql)\n\n \n self.conn.commit()\n self.conn.close()\n return\n\n\n \n def drop_index(self,table_name):\n self.conn = self.resetConn(timeout=-1)\n cursor = self.conn.cursor()\n cursor.execute(\"select indexname from pg_indexes where tablename='\"+table_name+\"';\")\n idxs = cursor.fetchall()\n for idx in idxs:\n the_sql = 'DROP INDEX ' + idx[0] + ';'\n cursor.execute(the_sql)\n print(the_sql)\n self.conn.commit()\n self.conn.close()\n 
return\n\n\ndef init():\n #add the config\n config_path = \"/root/DB-GPT/config/tool_config.yaml\"\n with open(config_path, 'r') as config_file:\n config = yaml.safe_load(config_file) \n db_args =DBArgs('pgsql', config)\n return db_args\n\n\n#create a table\ndef create_table(table_name,colsize, ncolumns):\n db=Database(init())\n column_definitions = ', '.join(f'name{i} varchar({colsize})' for i in range(ncolumns))\n creat_sql = f'CREATE TABLE {table_name} (id int, {column_definitions}, time timestamp);'\n db.execute_sqls(creat_sql)\n\n#delete the table\ndef delete_table(table_name):\n db=Database(init())\n delete_sql=f'DROP TABLE if exists {table_name}'\n db.execute_sqls(delete_sql)\n\n#print the current time\ndef print_time():\n current_time = datetime.datetime.now()\n formatted_time = current_time.strftime(\"%Y-%m-%d %H:%M:%S\")\n print(formatted_time)\n\ndef redundent_index(threads,duration,ncolumns,nrows,colsize,nindex,table_name='table1'):\n #create a new table\n print_time()\n delete_table(table_name)\n create_table(table_name,colsize, ncolumns)\n db=Database(init())\n # insert some data to be updated \n insert_definitions = ', '.join(f'(SELECT substr(md5(random()::text), 1, {colsize}))' for i in range(ncolumns))\n insert_data=f'insert into {table_name} select generate_series(1,{nrows}),{insert_definitions}, now();' \n db.execute_sqls(insert_data) \n\n #initialization of the indexes\n nindex=int((nindex*ncolumns)/10)\n db.build_index(table_name,nindex)\n id_index='CREATE INDEX index_'+table_name+'_id ON '+table_name+'(id);'\n db.execute_sqls(id_index)\n\n #lock_contention\n pool = Pool(threads)\n for _ in range(threads):\n pool.apply_async(\n lock, (table_name, ncolumns, colsize, duration, nrows))\n pool.close()\n pool.join()\n\n #drop the index\n db.drop_index(table_name)\n\n #delete the table\n delete_table(table_name)\n print_time()\n\n\ndef lock(table_name, ncolumns, colsize, duration, nrows):\n args=init()\n start = time.time()\n #lock_contention\n while time.time()-start < duration:\n conn = psycopg2.connect(database=args.dbname, user=args.user, password=args.password,\n host=args.host, port=args.port)\n cur = conn.cursor()\n while time.time()-start < duration:\n col_name = random.randint(0, ncolumns-1)\n row_name = random.randint(1, nrows-1)\n lock_contention = f'update {table_name} set name{col_name}=(SELECT substr(md5(random()::text), 1, {colsize})) where id ={row_name}'\n #db.concurrent_execute_sql(threads,duration,lock_contention,nrows)\n cur.execute(lock_contention)\n conn.commit()\n conn.commit()\n conn.close()\n\nif __name__ == \"__main__\":\n # Number of threads to use for concurrent inserts\n num_threads = 5\n \n # Duration for which to run the inserts (in seconds)\n insert_duration = None\n \n # Number of columns in the table\n num_columns = 76\n \n # Number of rows to insert\n num_rows = 733097\n \n # Size of each column (in characters)\n column_size = 68\n \n # Table name\n table_name = 'table1'\n \n nindex=6\n \n # Call the insert_large_data function\n redundent_index(num_threads, insert_duration, num_columns, num_rows, column_size, nindex,table_name)\n", + "description": "In a large online database, where there are 5 users searching for information in a table with 76 columns and 733,097 rows, each column having a size of 68 characters, the database has created redundant indexes for various attributes such as product name, category, and price range. 
This can lead to unnecessary storage and performance overhead.\n", + "desc": "In a business intelligence scenario, particularly in analyzing financial data and generating reports for large companies, suppose there is a database called 'CorporateFinanceDB'. This database is designed to store and process financial data related to various transactions and statements of the company. Within this database, there is a key table named 'FinancialRecords' that contains 733,097 rows of data, with each row representing a specific financial record. The table consists of 76 columns, each capable of storing information of up to 68 characters. These columns may include transaction ID, transaction type (such as income, expenditure, assets, or liabilities), transaction amount, date, department, project code, budget code, financial year, and audit status, among others. In a typical business intelligence analysis process, the database administrator may need to handle complex financial queries to generate reports efficiently. To optimize the query performance, the administrator may create redundant indexes before running these queries, which could be based on transaction type, date range, department, or project code. However, the creation of redundant indexes can lead to additional storage usage and performance overhead in the database. Furthermore, such frequent index operations may cause database fragmentation, which further impacts performance. In a business intelligence environment, this could result in delayed report generation, potentially affecting the efficiency of the decision-making process.\n" + }, + "29": { + "start_time": "1697301082", + "end_time": "1697301173", + "start_timestamp": "2023-10-15 00:31:22", + "end_timestamp": "2023-10-15 00:32:53", + "alerts": [], + "labels": [ + "INSERT_LARGE_DATA", + "IO_CONTENTION" + ], + "command": "python anomaly_trigger/main.py --anomaly INSERT_LARGE_DATA,IO_CONTENTION", + "script": "import os\nimport datetime\n\n#print the current time\ndef print_time():\n current_time = datetime.datetime.now()\n formatted_time = current_time.strftime(\"%Y-%m-%d %H:%M:%S\")\n print(formatted_time)\n\nif __name__ == \"__main__\":\n print_time()\n command = (\n \"su - root -c 'cd /sysbench-tpcc-master; \"\n \"./tpcc.lua --db-driver=pgsql --tables=2 --scale=3 --threads=50 --events=0 \"\n \"--pgsql-host=xxxx --pgsql-user=xxxx --pgsql-password=xxxx \"\n \"--pgsql-port=5432 --pgsql-db=tpcc --time=90 --rand-type=uniform --report-interval=10 run'\"\n )\n\n os.system(command)\n print_time()\n", + "description": "In a file sharing system, multiple users are simultaneously uploading, downloading, or editing files, causing competition for input/output operations. This results in slower file transfer speeds.\n", + "desc": "In an Internet of Things (IoT) scenario, imagine a database system called 'SmartHomeDB' that is designed to handle data from various smart devices in a home environment. This database contains multiple tables, one of which is a key table named 'DeviceData', where data from different devices is stored. This table contains information about the devices, such as device ID, device type (e.g., thermostat, lighting, security system), current status, and various sensor readings (e.g., temperature, humidity, motion). During peak usage periods, multiple devices in the smart home might be simultaneously collecting and reporting data to the 'DeviceData' table at a high frequency. 
This large volume of incoming data can strain the I/O (input/output) capabilities of the database, particularly the storage and network bandwidth. As a result, file transfer speeds might become slower, and the overall performance of the system might be impacted. Additionally, the frequent write operations in the database, such as recording sensor readings or updating device status, can put additional strain on the database's resources. This I/O contention, coupled with high concurrent write operations, may lead to delayed data processing and increased response times in the smart home system.\n" + }, + "30": { + "start_time": "1697301233", + "end_time": "1697301293", + "start_timestamp": "2023-10-15 00:33:53", + "end_timestamp": "2023-10-15 00:34:53", + "alerts": [], + "labels": [ + "POOR JOIN PERFORMANCE", + "CPU CONTENTION" + ], + "command": "python anomaly_trigger/main.py --anomaly POOR_JOIN_PERFORMANCE,CPU_CONTENTION", + "script": "import os\nimport re\nimport time\n\nimport psycopg2\n\n\nREPEATCOUNT = 1\nTIMELOGPATH = str(int(time.time())) + \"_job_trigger_time_log.txt\"\nTIMELOG = open(TIMELOGPATH, 'w+')\n\n\nclass Database():\n\n def __init__(self):\n self.conn = None\n self.conn = psycopg2.connect(database='imdbload',\n user='xxxx',\n password='xxxx',\n host='xxxx',\n port=5432)\n\n def execute_sql(self, sql):\n fail = 1\n cur = self.conn.cursor()\n i = 0\n cnt = 3\n while fail == 1 and i < cnt:\n try:\n fail = 0\n cur.execute(sql)\n except BaseException as error:\n fail = 1\n print(error)\n res = []\n if fail == 0:\n res = cur.fetchall()\n i = i + 1\n if fail == 1:\n # print(\"SQL Execution Fatal!!\", sql)\n return 0, ''\n elif fail == 0:\n return 1, res\n\n\ndef all_sql_files():\n res_path = \"{}/join-order-benchmark-master/\".format(\n os.path.dirname(os.path.abspath(__file__)))\n # all_file_list = list(filter(file_filter, os.listdir(res_path)))\n # all_file_list = sorted(all_file_list, key=custom_sort)\n all_file_list = [\n '1a.sql', '1b.sql', '1c.sql', '1d.sql',\n '2a.sql', '2b.sql', '2c.sql', '2d.sql',\n '3a.sql', '3b.sql', '3c.sql',\n '4a.sql', '4b.sql', '4c.sql',\n '5a.sql', '5b.sql', '5c.sql',\n '6a.sql', '6b.sql', '6c.sql', '6d.sql', '6e.sql', '6f.sql',\n '7a.sql', '7b.sql', '7c.sql',\n '8a.sql', '8b.sql', '8c.sql', '8d.sql',\n '9a.sql', '9b.sql', '9c.sql', '9d.sql',\n '10a.sql', '10b.sql', '10c.sql',\n '11a.sql', '11b.sql', '11c.sql', '11d.sql',\n '12a.sql', '12b.sql', '12c.sql',\n '13a.sql', '13b.sql', '13c.sql', '13d.sql',\n '14a.sql', '14b.sql', '14c.sql',\n '15a.sql', '15b.sql', '15c.sql', '15d.sql',\n '16a.sql', '16b.sql', '16c.sql', '16d.sql',\n '17a.sql', '17b.sql', '17c.sql', '17d.sql', '17e.sql', '17f.sql',\n '18a.sql', '18b.sql', '18c.sql',\n '19a.sql', '19b.sql', '19c.sql', '19d.sql',\n '20a.sql', '20b.sql', '20c.sql',\n '21a.sql', '21b.sql', '21c.sql',\n '22a.sql', '22b.sql', '22c.sql', '22d.sql',\n '23a.sql', '23b.sql', '23c.sql',\n '24a.sql', '24b.sql',\n '25a.sql', '25b.sql', '25c.sql',\n '26a.sql', '26b.sql', '26c.sql',\n '27a.sql', '27b.sql', '27c.sql',\n '28a.sql', '28b.sql', '28c.sql',\n '29a.sql', '29b.sql', '29c.sql',\n '30a.sql', '30b.sql', '30c.sql',\n '31a.sql', '31b.sql', '31c.sql',\n '32a.sql', '32b.sql',\n '33a.sql', '33b.sql', '33c.sql']\n\n print(all_file_list)\n files_list = []\n for file in all_file_list:\n files_list.append(res_path + file)\n return files_list\n\n\ndef custom_sort(item):\n # \u63d0\u53d6\u6570\u5b57\u548c\u5b57\u6bcd\u90e8\u5206\n match = re.match(r'(\\d+)(\\D+)', item)\n # 
\u5c06\u6570\u5b57\u90e8\u5206\u8f6c\u6362\u4e3a\u6574\u6570\u4ee5\u8fdb\u884c\u6bd4\u8f83\n num_part = int(match.group(1))\n # \u8fd4\u56de\u5143\u7ec4\u4ee5\u6309\u6570\u5b57\u548c\u5b57\u6bcd\u6392\u5e8f\n return (num_part, match.group(2))\n\n\ndef file_filter(f):\n if f[-4:] == '.sql' and 'schema' not in f and 'fkindexes' not in f:\n return True\n else:\n return False\n\n\ndef get_sql_from_file(file_name):\n file = open(file_name)\n lines = file.readlines().copy()\n sql = ''\n for line in lines:\n sql += line\n sql = sql.replace('\n', ' ').replace(' ', ' ').replace(' ', ' ')\n file.close()\n return sql\n\n\ndef test_hint_from_file(sql_file):\n db = Database()\n sql = get_sql_from_file(sql_file)\n success, result_cont = db.execute_sql(sql)\n print(success, result_cont)\n\n\ndef test_all():\n sql_files = all_sql_files()\n\n for sql_file in list(sql_files)[:-10]:\n if sql_file:\n test_hint_from_file(sql_file)\n\n\ndef test_one():\n res_path = \"{}/join-order-benchmark-master/\".format(\n os.path.dirname(os.path.abspath(__file__)))\n test_hint_from_file(res_path + '1a.sql')\n\n\nif __name__ == '__main__':\n for i in range(0, REPEATCOUNT):\n TIMELOG.write(str(int(time.time()))+\";\")\n test_all()\n TIMELOG.write(str(int(time.time()))+\"\n\")\n TIMELOG.flush()\n\n TIMELOG.close()\n\n)\n", + "description": "In an online ticketing system, multiple users are searching for events based on different criteria such as location, date, and ticket price. However, the join operation between the event table and the location table is not optimized, leading to poor performance. Additionally, there is CPU contention as multiple users are running resource-intensive queries simultaneously, causing the CPU to be overloaded and slowing down the overall system performance.\n", + "desc": "In an IoT scenario, suppose there is a database named 'IoTDataDB' that collects and stores data from various sensors and devices. This database is designed to handle large volumes of data and perform complex queries for IoT analytics. One of the key tables in this database is called 'SensorData', which contains data from different sensors, such as temperature, humidity, pressure, motion, and more. Each row in the table represents a specific sensor reading, and there are millions of rows in total. The table consists of multiple columns, including sensor ID, sensor type, timestamp, value, location, and more. In this scenario, a particular query involves joining the 'SensorData' table with another table named 'SensorLocations', which stores the geographical coordinates of each sensor. The purpose of this join operation is to combine the sensor data with their corresponding location information. However, due to poor indexing or inefficient join algorithms, the performance of this join operation might be affected. This can lead to slow query execution times, high CPU utilization, and contention for computing resources. 
In an IoT environment with a high frequency of data collection and complex analytics, such poor join performance and CPU contention can considerably impact the efficiency and timeliness of data analysis and decision-making processes.\n" + }, + "31": { + "start_time": "1697301353", + "end_time": "1697301503", + "start_timestamp": "2023-10-15 00:35:53", + "end_timestamp": "2023-10-15 00:38:23", + "alerts": [], + "labels": [ + "FETCH_LARGE_DATA", + "CORRELATED SUBQUERY" + ], + "command": "python anomaly_trigger/main.py --anomaly FETCH_LARGE_DATA,CORRELATED_SUBQUERY", + "script": "import os\nimport re\nimport time\n\nimport psycopg2\n\n\nREPEATCOUNT = 1\nTIMELOGPATH = str(int(time.time())) + \"_tpch_trigger_time_log.txt\"\nTIMELOG = open(TIMELOGPATH, 'w+')\n\n\nclass Database():\n\n def __init__(self):\n self.conn = None\n self.conn = psycopg2.connect(database='tpch',\n user='xxxx',\n password='xxxx',\n host='xxxx',\n port=5432)\n\n def execute_sql(self, sql):\n fail = 1\n cur = self.conn.cursor()\n i = 0\n cnt = 3\n while fail == 1 and i < cnt:\n try:\n fail = 0\n cur.execute(sql)\n except BaseException as error:\n fail = 1\n print(error)\n res = []\n if fail == 0:\n res = cur.fetchall()\n i = i + 1\n if fail == 1:\n # print(\"SQL Execution Fatal!!\", sql)\n return 0, ''\n elif fail == 0:\n return 1, res\n\n\ndef all_sql_files():\n res_path = \"{}/tpch-queries/\".format(\n os.path.dirname(os.path.abspath(__file__)))\n # all_file_list = list(filter(file_filter, os.listdir(res_path)))\n # all_file_list = sorted(all_file_list, key=custom_sort)\n all_file_list = [\n '4.explain.sql']\n\n print(all_file_list)\n files_list = []\n for file in all_file_list:\n files_list.append(res_path + file)\n return files_list\n\n\ndef custom_sort(item):\n # \u63d0\u53d6\u6570\u5b57\u548c\u5b57\u6bcd\u90e8\u5206\n match = re.match(r'(\\d+)(\\D+)', item)\n # \u5c06\u6570\u5b57\u90e8\u5206\u8f6c\u6362\u4e3a\u6574\u6570\u4ee5\u8fdb\u884c\u6bd4\u8f83\n num_part = int(match.group(1))\n # \u8fd4\u56de\u5143\u7ec4\u4ee5\u6309\u6570\u5b57\u548c\u5b57\u6bcd\u6392\u5e8f\n return (num_part, match.group(2))\n\n\ndef file_filter(f):\n if f[-4:] == '.sql' and 'schema' not in f and 'fkindexes' not in f:\n return True\n else:\n return False\n\n\ndef get_sql_from_file(file_name):\n file = open(file_name)\n lines = file.readlines().copy()\n sql = ''\n for line in lines:\n sql += line\n sql = sql.replace('\n', ' ').replace(' ', ' ').replace(' ', ' ')\n file.close()\n return sql\n\n\ndef test_hint_from_file(sql_file):\n db = Database()\n sql = get_sql_from_file(sql_file)\n success, result_cont = db.execute_sql(sql)\n print(success, result_cont)\n\n\ndef test_all():\n sql_files = all_sql_files()\n\n for sql_file in list(sql_files):\n if sql_file:\n test_hint_from_file(sql_file)\n\n\ndef test_one():\n res_path = \"{}/tpch-queries/\".format(\n os.path.dirname(os.path.abspath(__file__)))\n test_hint_from_file(res_path + '1.explain.sql')\n\n\nif __name__ == '__main__':\n for i in range(0, REPEATCOUNT):\n TIMELOG.write(str(int(time.time()))+\";\")\n test_all()\n TIMELOG.write(str(int(time.time()))+\"\n\")\n TIMELOG.flush()\n\n TIMELOG.close()\n", + "description": "In an online store's database, there is a situation where a large amount of data needs to be fetched, specifically the inventory information for each product. This process involves executing correlated subqueries. 
If these subqueries are not optimized, the performance of retrieving the inventory data may be negatively impacted.\n", + "desc": "In an Internet of Things (IoT) scenario, there is a database called 'SensorDataDB'. This database is designed to store and process data from various sensors in a smart home. One of the key tables in this database is 'SensorReadings', which contains information about the readings from 100 sensors. The fields in this table include sensor ID, reading type (e.g., temperature, humidity, pressure), reading value, timestamp, sensor location, and status information. When multiple sensors start transmitting data simultaneously at a high frequency, the database might encounter performance issues. This could be due to factors such as lack of data partitioning, insufficient buffering mechanisms, or improper indexing. As a result, the database's ability to handle these simultaneous write requests can be limited, leading to anomalies. These anomalies can manifest as increased write latency, database locking, and overall degradation in database performance.\n" + }, + "32": { + "start_time": "1697301564", + "end_time": "1697301635", + "start_timestamp": "2023-10-15 00:39:24", + "end_timestamp": "2023-10-15 00:40:35", + "alerts": [], + "labels": [ + "highly concurrent commits or highly concurrent inserts" + ], + "command": "python anomaly_trigger/main.py --anomaly INSERT_LARGE_DATA", + "script": "import psycopg2\nimport sys\nsys.path.append('/root/DB-GPT/')\nimport time\nimport datetime\nimport random\nimport yaml\nfrom multiprocessing.pool import *\n\nclass DBArgs(object):\n\n def __init__(self, dbtype, config, dbname=None):\n self.dbtype = dbtype\n if self.dbtype == 'mysql':\n self.host = config['host']\n self.port = config['port']\n self.user = config['user']\n self.password = config['password']\n self.dbname = dbname if dbname else config['dbname']\n self.driver = 'com.mysql.jdbc.Driver'\n self.jdbc = 'jdbc:mysql://'\n else:\n self.host = config['host']\n self.port = config['port']\n self.user = config['user']\n self.password = config['password']\n self.dbname = dbname if dbname else config['dbname']\n self.driver = 'org.postgresql.Driver'\n self.jdbc = 'jdbc:postgresql://'\n\nclass Database():\n def __init__(self, args, timeout=-1):\n self.args = args\n self.conn = self.resetConn(timeout)\n\n\n # self.schema = self.compute_table_schema()\n\n def resetConn(self, timeout=-1):\n conn = psycopg2.connect(database=self.args.dbname,\n user=self.args.user,\n password=self.args.password,\n host=self.args.host,\n port=self.args.port)\n return conn\n\n def execute_sqls(self,sql):\n self.conn =self.resetConn(timeout=-1)\n cur = self.conn.cursor()\n cur.execute(sql)\n self.conn.commit()\n cur.close()\n self.conn.close()\n\n def execute_sql_duration(self, duration, sql, max_id=0, commit_interval=500):\n self.conn = self.resetConn(timeout=-1)\n cursor = self.conn.cursor()\n start = time.time()\n cnt = 0\n if duration > 0:\n while (time.time() - start) < duration:\n if max_id > 0:\n id = random.randint(1, max_id - 1)\n cursor.execute(sql + str(id) + ';')\n else:\n cursor.execute(sql)\n cnt += 1\n if cnt % commit_interval == 0:\n self.conn.commit()\n else:\n print(\"error, the duration should be larger than 0\")\n self.conn.commit()\n cursor.close()\n self.conn.close()\n return cnt\n\n def concurrent_execute_sql(self, threads, duration, sql, max_id=0, commit_interval=500):\n pool = ThreadPool(threads)\n results = [pool.apply_async(self.execute_sql_duration, (duration, sql, max_id, 
commit_interval)) for _ in range(threads)]\n pool.close()\n pool.join()\n return results\n \ndef init():\n #add the config\n config_path = \"/root/DB-GPT/config/tool_config.yaml\"\n with open(config_path, 'r') as config_file:\n config = yaml.safe_load(config_file) \n db_args =DBArgs('pgsql', config)\n return db_args\n\n#create a table\ndef create_table(table_name,colsize, ncolumns):\n db=Database(init())\n column_definitions = ', '.join(f'name{i} varchar({colsize})' for i in range(ncolumns))\n creat_sql = f'CREATE TABLE {table_name} (id int, {column_definitions}, time timestamp);'\n db.execute_sqls(creat_sql)\n\n#delete the table\ndef delete_table(table_name):\n db=Database(init())\n delete_sql=f'DROP TABLE if exists {table_name}'\n db.execute_sqls(delete_sql)\n\n#print the current time\ndef print_time():\n current_time = datetime.datetime.now()\n formatted_time = current_time.strftime(\"%Y-%m-%d %H:%M:%S\")\n print(formatted_time)\n\n\ndef insert_large_data(threads,duration,ncolumns,nrows,colsize,table_name='table1'):\n\n print_time()\n #Delete undeleted tables\n delete_table(table_name)\n #create a new table\n create_table(table_name,colsize, ncolumns)\n db=Database(init())\n #insert the data\n #insert_definitions = ', '.join(f'repeat(round(random()*999)::text,{(colsize//3)})' for i in range(ncolumns))\n insert_definitions = ', '.join(f'(SELECT substr(md5(random()::text), 1, {colsize}))' for i in range(ncolumns))\n insert_data=f'insert into {table_name} select generate_series(1,{nrows}),{insert_definitions}, now();'\n db.concurrent_execute_sql(threads,duration,insert_data,commit_interval=1)\n\n #delete the table\n delete_table(table_name)\n \n #print the end time\n print_time()\nif __name__ == \"__main__\":\n # Number of threads to use for concurrent inserts\n num_threads = 69\n \n # Duration for which to run the inserts (in seconds)\n insert_duration = None\n \n # Number of columns in the table\n num_columns = 16\n \n # Number of rows to insert\n num_rows = 86\n \n # Size of each column (in characters)\n column_size = 71\n \n # Table name\n table_name = 'table1'\n \n # Call the insert_large_data function\n insert_large_data(num_threads, insert_duration, num_columns, num_rows, column_size, table_name)\n", + "description": "In an IoT application, there is a situation where 69 sensors generate a large amount of data that needs to be simultaneously inserted into a database table. The table contains 16 columns, and each column can hold up to 71 characters. There are 86 rows in the table. This scenario simulates a database exception caused by the process of inserting the large data set.\n", + "desc": "In the internet of things (IoT) scenario, there is a database called 'IoTDataDB' that is used to store data collected from various IoT devices. This database contains a primary table named 'DeviceData' which records information about the devices and their corresponding data. In this table, there are 86 rows of data, each representing a device reading, and there are 16 columns, each containing information up to 71 characters long. These columns may include device ID, sensor type, sensor reading, timestamp, location, battery level, signal strength, and other related attributes. During a specific operation, 69 devices simultaneously transmit data to the database at a high frequency. However, due to the lack of efficient data buffering or proper indexing, the database may encounter performance issues in handling these numerous concurrent write requests. 
This can result in increased write latency and potentially cause anomalies in the database. These anomalies may lead to delays in processing new device data or failures in storing accurate data, ultimately affecting the overall performance and reliability of the IoT system.\n" + }, + "33": { + "start_time": "1697301695", + "end_time": "1697301766", + "start_timestamp": "2023-10-15 00:41:35", + "end_timestamp": "2023-10-15 00:42:46", + "alerts": [], + "labels": [ + "highly concurrent commits or highly concurrent inserts" + ], + "command": "python anomaly_trigger/main.py --anomaly INSERT_LARGE_DATA", + "script": "import psycopg2\nimport sys\nsys.path.append('/root/DB-GPT/')\nimport time\nimport datetime\nimport random\nimport yaml\nfrom multiprocessing.pool import *\n\nclass DBArgs(object):\n\n def __init__(self, dbtype, config, dbname=None):\n self.dbtype = dbtype\n if self.dbtype == 'mysql':\n self.host = config['host']\n self.port = config['port']\n self.user = config['user']\n self.password = config['password']\n self.dbname = dbname if dbname else config['dbname']\n self.driver = 'com.mysql.jdbc.Driver'\n self.jdbc = 'jdbc:mysql://'\n else:\n self.host = config['host']\n self.port = config['port']\n self.user = config['user']\n self.password = config['password']\n self.dbname = dbname if dbname else config['dbname']\n self.driver = 'org.postgresql.Driver'\n self.jdbc = 'jdbc:postgresql://'\n\nclass Database():\n def __init__(self, args, timeout=-1):\n self.args = args\n self.conn = self.resetConn(timeout)\n\n\n # self.schema = self.compute_table_schema()\n\n def resetConn(self, timeout=-1):\n conn = psycopg2.connect(database=self.args.dbname,\n user=self.args.user,\n password=self.args.password,\n host=self.args.host,\n port=self.args.port)\n return conn\n\n def execute_sqls(self,sql):\n self.conn =self.resetConn(timeout=-1)\n cur = self.conn.cursor()\n cur.execute(sql)\n self.conn.commit()\n cur.close()\n self.conn.close()\n\n def execute_sql_duration(self, duration, sql, max_id=0, commit_interval=500):\n self.conn = self.resetConn(timeout=-1)\n cursor = self.conn.cursor()\n start = time.time()\n cnt = 0\n if duration > 0:\n while (time.time() - start) < duration:\n if max_id > 0:\n id = random.randint(1, max_id - 1)\n cursor.execute(sql + str(id) + ';')\n else:\n cursor.execute(sql)\n cnt += 1\n if cnt % commit_interval == 0:\n self.conn.commit()\n else:\n print(\"error, the duration should be larger than 0\")\n self.conn.commit()\n cursor.close()\n self.conn.close()\n return cnt\n\n def concurrent_execute_sql(self, threads, duration, sql, max_id=0, commit_interval=500):\n pool = ThreadPool(threads)\n results = [pool.apply_async(self.execute_sql_duration, (duration, sql, max_id, commit_interval)) for _ in range(threads)]\n pool.close()\n pool.join()\n return results\n \ndef init():\n #add the config\n config_path = \"/root/DB-GPT/config/tool_config.yaml\"\n with open(config_path, 'r') as config_file:\n config = yaml.safe_load(config_file) \n db_args =DBArgs('pgsql', config)\n return db_args\n\n#create a table\ndef create_table(table_name,colsize, ncolumns):\n db=Database(init())\n column_definitions = ', '.join(f'name{i} varchar({colsize})' for i in range(ncolumns))\n creat_sql = f'CREATE TABLE {table_name} (id int, {column_definitions}, time timestamp);'\n db.execute_sqls(creat_sql)\n\n#delete the table\ndef delete_table(table_name):\n db=Database(init())\n delete_sql=f'DROP TABLE if exists {table_name}'\n db.execute_sqls(delete_sql)\n\n#print the current time\ndef print_time():\n 
current_time = datetime.datetime.now()\n formatted_time = current_time.strftime(\"%Y-%m-%d %H:%M:%S\")\n print(formatted_time)\n\n\ndef insert_large_data(threads,duration,ncolumns,nrows,colsize,table_name='table1'):\n\n print_time()\n #Delete undeleted tables\n delete_table(table_name)\n #create a new table\n create_table(table_name,colsize, ncolumns)\n db=Database(init())\n #insert the data\n #insert_definitions = ', '.join(f'repeat(round(random()*999)::text,{(colsize//3)})' for i in range(ncolumns))\n insert_definitions = ', '.join(f'(SELECT substr(md5(random()::text), 1, {colsize}))' for i in range(ncolumns))\n insert_data=f'insert into {table_name} select generate_series(1,{nrows}),{insert_definitions}, now();'\n db.concurrent_execute_sql(threads,duration,insert_data,commit_interval=1)\n\n #delete the table\n delete_table(table_name)\n \n #print the end time\n print_time()\nif __name__ == \"__main__\":\n # Number of threads to use for concurrent inserts\n num_threads = 69\n \n # Duration for which to run the inserts (in seconds)\n insert_duration = None\n \n # Number of columns in the table\n num_columns = 22\n \n # Number of rows to insert\n num_rows = 77\n \n # Size of each column (in characters)\n column_size = 71\n \n # Table name\n table_name = 'table1'\n \n # Call the insert_large_data function\n insert_large_data(num_threads, insert_duration, num_columns, num_rows, column_size, table_name)\n", + "description": "In an IoT application, there is a need to insert a large amount of data into a database. This scenario simulates the database exception caused by inserting data from 69 sensors simultaneously. The database table contains 22 columns, with each column having a size of 71 characters. There are 77 rows of data to be inserted.\n", + "desc": "In the Internet of Things (IoT) scenario, there is a database system called 'IoTDataDB' that is designed to handle a large volume of data from various sensors. This database stores sensor readings from devices such as temperature sensors, humidity sensors, pressure sensors, light sensors, and motion sensors. The primary table in the database is called 'SensorReadings', which contains 77 rows of data, each representing a reading from a sensor. This table has 22 columns, each with a size of 71 characters, including sensor ID, reading type, reading value, timestamp, sensor location, and status information. In this particular scenario, there are 69 devices that are simultaneously transmitting data at a high frequency. Due to the high volume of data and the concurrent write operations, the database might encounter performance issues. 
Without proper optimization measures, such as data partitioning or buffering mechanisms, the database's ability to handle these large-scale write operations could be significantly affected, leading to anomalies in the system.\n" + }, + "34": { + "start_time": "1697301826", + "end_time": "1697301886", + "start_timestamp": "2023-10-15 00:43:46", + "end_timestamp": "2023-10-15 00:44:46", + "alerts": [], + "labels": [ + "highly concurrent updates" + ], + "command": "python anomaly_trigger/main.py --anomaly LOCK_CONTENTION", + "script": "import psycopg2\nimport sys\nsys.path.append('/root/DB-GPT/')\nimport time\nimport datetime\nimport random\nimport yaml\nfrom multiprocessing.pool import *\n\n\nclass DBArgs(object):\n\n def __init__(self, dbtype, config, dbname=None):\n self.dbtype = dbtype\n if self.dbtype == 'mysql':\n self.host = config['host']\n self.port = config['port']\n self.user = config['user']\n self.password = config['password']\n self.dbname = dbname if dbname else config['dbname']\n self.driver = 'com.mysql.jdbc.Driver'\n self.jdbc = 'jdbc:mysql://'\n else:\n self.host = config['host']\n self.port = config['port']\n self.user = config['user']\n self.password = config['password']\n self.dbname = dbname if dbname else config['dbname']\n self.driver = 'org.postgresql.Driver'\n self.jdbc = 'jdbc:postgresql://'\n\nclass Database():\n def __init__(self, args, timeout=-1):\n self.args = args\n self.conn = self.resetConn(timeout)\n\n\n # self.schema = self.compute_table_schema()\n\n def resetConn(self, timeout=-1):\n conn = psycopg2.connect(database=self.args.dbname,\n user=self.args.user,\n password=self.args.password,\n host=self.args.host,\n port=self.args.port)\n return conn\n \n def execute_sqls(self,sql):\n self.conn =self.resetConn(timeout=-1)\n cur = self.conn.cursor()\n cur.execute(sql)\n self.conn.commit()\n cur.close()\n self.conn.close()\n\n def execute_sql_duration(self, duration, sql, max_id=0, commit_interval=500):\n self.conn = self.resetConn(timeout=-1)\n cursor = self.conn.cursor()\n start = time.time()\n cnt = 0\n if duration > 0:\n while (time.time() - start) < duration:\n if max_id > 0:\n id = random.randint(1, max_id - 1)\n cursor.execute(sql + str(id) + ';')\n else:\n cursor.execute(sql)\n cnt += 1\n if cnt % commit_interval == 0:\n self.conn.commit()\n else:\n print(\"error, the duration should be larger than 0\")\n self.conn.commit()\n cursor.close()\n self.conn.close()\n return cnt\n\n def concurrent_execute_sql(self, threads, duration, sql, max_id=0, commit_interval=500):\n pool = ThreadPool(threads)\n results = [pool.apply_async(self.execute_sql_duration, (duration, sql, max_id, commit_interval)) for _ in range(threads)]\n pool.close()\n pool.join()\n return results\n\ndef init():\n #add the config\n config_path = \"/root/DB-GPT/config/tool_config.yaml\"\n with open(config_path, 'r') as config_file:\n config = yaml.safe_load(config_file) \n db_args =DBArgs('pgsql', config)\n return db_args\n\n\n#create a table\ndef create_table(table_name,colsize, ncolumns):\n db=Database(init())\n column_definitions = ', '.join(f'name{i} varchar({colsize})' for i in range(ncolumns))\n creat_sql = f'CREATE TABLE {table_name} (id int, {column_definitions}, time timestamp);'\n db.execute_sqls(creat_sql)\n\n#delete the table\ndef delete_table(table_name):\n db=Database(init())\n delete_sql=f'DROP TABLE if exists {table_name}'\n db.execute_sqls(delete_sql)\n\n#print the current time\ndef print_time():\n current_time = datetime.datetime.now()\n formatted_time = 
current_time.strftime(\"%Y-%m-%d %H:%M:%S\")\n print(formatted_time)\n\n\ndef lock_contention(threads,duration,ncolumns,nrows,colsize,table_name='table1'):\n #create a new table\n print_time()\n delete_table(table_name)\n create_table(table_name,colsize, ncolumns)\n db=Database(init())\n # insert some data to be updated \n insert_definitions = ', '.join(f'(SELECT substr(md5(random()::text), 1, {colsize}))' for i in range(ncolumns))\n insert_data=f'insert into {table_name} select generate_series(1,{nrows}),{insert_definitions}, now();' \n db.execute_sqls(insert_data) \n pool = Pool(threads)\n for _ in range(threads):\n pool.apply_async(\n lock, (table_name, ncolumns, colsize, duration, nrows))\n pool.close()\n pool.join()\n #delete the table\n delete_table(table_name)\n print_time()\n\ndef lock(table_name, ncolumns, colsize, duration, nrows):\n args=init()\n start = time.time()\n #lock_contention\n while time.time()-start < duration:\n conn = psycopg2.connect(database=args.dbname, user=args.user, password=args.password,\n host=args.host, port=args.port)\n cur = conn.cursor()\n while time.time()-start < duration:\n col_name = random.randint(0, ncolumns-1)\n row_name = random.randint(1, nrows-1)\n lock_contention = f'update {table_name} set name{col_name}=(SELECT substr(md5(random()::text), 1, {colsize})) where id ={row_name}'\n #db.concurrent_execute_sql(threads,duration,lock_contention,nrows)\n cur.execute(lock_contention)\n conn.commit()\n conn.commit()\n conn.close()\n\nif __name__ == \"__main__\":\n # Number of threads to use for concurrent inserts\n num_threads = 67\n \n # Duration for which to run the inserts (in seconds)\n insert_duration = None\n \n # Number of columns in the table\n num_columns = 65\n \n # Number of rows to insert\n num_rows = 361\n \n # Size of each column (in characters)\n column_size = 99\n \n # Table name\n table_name = 'table1'\n \n # Call the insert_large_data function\n lock_contention(num_threads, insert_duration, num_columns, num_rows, column_size, table_name)\n", + "description": "In the database of an online store, 67 users are simultaneously performing frequent update operations in a database table containing 65 columns and 361 rows of product records, each with a column size of 99 characters. The users compete with each other to lock the database table to perform the update operations.\n", + "desc": "In an Internet of Things (IoT) scenario, imagine there is a database named 'IoTDataDB' that stores data collected from various IoT devices. This database is designed to handle a large volume of sensor data and has a primary table called 'SensorData', which contains detailed information from these devices. Each row in the table represents a data entry from a specific sensor and contains information such as sensor ID, sensor type, measurement value, timestamp, sensor location, and status. In this specific case, there are 361 rows of data in the 'SensorData' table, with each row representing a data entry from a different sensor. The table has 65 columns, each capable of storing information of up to 99 characters. These columns include sensor ID, type, reading value, time of measurement, location, and status, among others. At a particular moment, there are multiple IoT devices sending sensor data to the database simultaneously, and the system is experiencing high concurrency. 
Due to the design of the database and its locking mechanism, when 67 users try to update the same or adjacent rows in the 'SensorData' table, they may encounter contention for accessing and locking the table. This contention arises because the database allows only one user at a time to modify or access a specific row or a set of related rows. This prolonged locking due to contention can lead to delayed processing or even failure of other users' attempts to access or modify the data in the 'SensorData' table. Consequently, this can impact the overall efficiency and reliability of the IoT system, potentially causing delays in data processing or inaccurate analysis due to missed or delayed sensor readings.\n" + }, + "35": { + "start_time": "1697301946", + "end_time": "1697301992", + "start_timestamp": "2023-10-15 00:45:46", + "end_timestamp": "2023-10-15 00:46:32", + "alerts": [], + "labels": [ + "highly deletes" + ], + "command": "python anomaly_trigger/main.py --anomaly VACUUM", + "script": "import psycopg2\nimport sys\nsys.path.append('/root/DB-GPT/')\nimport time\nimport datetime\nimport random\nimport yaml\nfrom multiprocessing.pool import *\n\n\nclass DBArgs(object):\n\n def __init__(self, dbtype, config, dbname=None):\n self.dbtype = dbtype\n if self.dbtype == 'mysql':\n self.host = config['host']\n self.port = config['port']\n self.user = config['user']\n self.password = config['password']\n self.dbname = dbname if dbname else config['dbname']\n self.driver = 'com.mysql.jdbc.Driver'\n self.jdbc = 'jdbc:mysql://'\n else:\n self.host = config['host']\n self.port = config['port']\n self.user = config['user']\n self.password = config['password']\n self.dbname = dbname if dbname else config['dbname']\n self.driver = 'org.postgresql.Driver'\n self.jdbc = 'jdbc:postgresql://'\n\nclass Database():\n def __init__(self, args, timeout=-1):\n self.args = args\n self.conn = self.resetConn(timeout)\n\n\n # self.schema = self.compute_table_schema()\n\n def resetConn(self, timeout=-1):\n conn = psycopg2.connect(database=self.args.dbname,\n user=self.args.user,\n password=self.args.password,\n host=self.args.host,\n port=self.args.port)\n return conn\n \n def execute_sqls(self,sql):\n self.conn =self.resetConn(timeout=-1)\n cur = self.conn.cursor()\n cur.execute(sql)\n self.conn.commit()\n cur.close()\n self.conn.close()\n\n def execute_sql_duration(self, duration, sql, max_id=0, commit_interval=500):\n self.conn = self.resetConn(timeout=-1)\n cursor = self.conn.cursor()\n start = time.time()\n cnt = 0\n if duration > 0:\n while (time.time() - start) < duration:\n if max_id > 0:\n id = random.randint(1, max_id - 1)\n cursor.execute(sql + str(id) + ';')\n else:\n cursor.execute(sql)\n cnt += 1\n if cnt % commit_interval == 0:\n self.conn.commit()\n else:\n print(\"error, the duration should be larger than 0\")\n self.conn.commit()\n cursor.close()\n self.conn.close()\n return cnt\n\n def concurrent_execute_sql(self, threads, duration, sql, max_id=0, commit_interval=500):\n pool = ThreadPool(threads)\n results = [pool.apply_async(self.execute_sql_duration, (duration, sql, max_id, commit_interval)) for _ in range(threads)]\n pool.close()\n pool.join()\n return results\n \ndef init():\n #add the config\n config_path = \"/root/DB-GPT/config/tool_config.yaml\"\n with open(config_path, 'r') as config_file:\n config = yaml.safe_load(config_file) \n db_args =DBArgs('pgsql', config)\n return db_args\n\n\n#create a table\ndef create_table(table_name,colsize, ncolumns):\n db=Database(init())\n column_definitions 
= ', '.join(f'name{i} varchar({colsize})' for i in range(ncolumns))\n creat_sql = f'CREATE TABLE {table_name} (id int, {column_definitions}, time timestamp);'\n db.execute_sqls(creat_sql)\n\n#delete the table\ndef delete_table(table_name):\n db=Database(init())\n delete_sql=f'DROP TABLE if exists {table_name}'\n db.execute_sqls(delete_sql)\n\n#print the current time\ndef print_time():\n current_time = datetime.datetime.now()\n formatted_time = current_time.strftime(\"%Y-%m-%d %H:%M:%S\")\n print(formatted_time)\n\n\ndef vacuum(threads,duration,ncolumns,nrows,colsize,table_name='table1'):\n db=Database(init())\n #create a new table\n print_time()\n delete_table(table_name)\n create_table(table_name,colsize, ncolumns)\n\n # insert some data to be deleted\n insert_definitions = ', '.join(f'(SELECT substr(md5(random()::text), 1, {colsize}))' for i in range(ncolumns))\n insert_data=f'insert into {table_name} select generate_series(1,{nrows}),{insert_definitions}, now();' \n db.execute_sqls(insert_data) \n\n # delete 80% of the rows\n delete_nrows=int(nrows*0.8)\n vacuum=f'delete from {table_name} where id < {delete_nrows};'\n db.execute_sqls(vacuum)\n\n # do the select , then the vacuum occurs\n select='select * from '+table_name+' where id='\n db.concurrent_execute_sql(threads,duration,select,nrows)\n\n #delete the table\n delete_table(table_name)\n print_time()\n\n\nif __name__ == \"__main__\":\n # Number of threads to use for concurrent inserts\n num_threads = 56\n \n # Duration for which to run the inserts (in seconds)\n insert_duration = None\n \n # Number of columns in the table\n num_columns = 16\n \n # Number of rows to insert\n num_rows = 3445648\n \n # Size of each column (in characters)\n column_size = 71\n \n # Table name\n table_name = 'table1'\n \n # Call the insert_large_data function\n vacuum(num_threads, insert_duration, num_columns, num_rows, column_size, table_name)\n", + "description": "In the database of an online store, if there are 56 users searching in the database table containing 16 columns, 3,445,648 rows, each column size of 71 characters, after a large-scale data cleaning operation, it may cause an exception in the database.\n", + "desc": "In an Internet of Things (IoT) setting, there is a database named 'IoTDataDB' that is specifically used for storing and analyzing sensor data. This database contains a table called 'SensorData', which is responsible for recording data from various types of sensors. The 'SensorData' table consists of 3,445,648 rows, each representing a data entry from a sensor. There are 16 columns in this table, including sensor ID, sensor type, sensor value, timestamp, location, status, and other relevant information. The size of each column is 71 characters. In a particular scenario, 56 devices (sensors) are connected to the IoT platform, and these devices start transmitting data simultaneously at a high rate. However, due to the large volume of data being written to the 'SensorData' table, the database might encounter performance issues. This could be a result of insufficient buffer resources, ineffective data partitioning, or inadequate indexing. As a consequence, the database's ability to handle these concurrent write requests efficiently becomes limited. 
This can lead to increased latency in data storage and potentially trigger anomalies in the database.\n" + }, + "36": { + "start_time": "1697302052", + "end_time": "1697302167", + "start_timestamp": "2023-10-15 00:47:32", + "end_timestamp": "2023-10-15 00:49:27", + "alerts": [], + "labels": [ + "too many indexes" + ], + "command": "python anomaly_trigger/main.py --anomaly REDUNDANT_INDEX", + "script": "import psycopg2\nimport sys\nsys.path.append('/root/DB-GPT/')\nimport time\nimport datetime\nimport random\nimport yaml\nfrom multiprocessing.pool import *\n\n\nclass DBArgs(object):\n\n def __init__(self, dbtype, config, dbname=None):\n self.dbtype = dbtype\n if self.dbtype == 'mysql':\n self.host = config['host']\n self.port = config['port']\n self.user = config['user']\n self.password = config['password']\n self.dbname = dbname if dbname else config['dbname']\n self.driver = 'com.mysql.jdbc.Driver'\n self.jdbc = 'jdbc:mysql://'\n else:\n self.host = config['host']\n self.port = config['port']\n self.user = config['user']\n self.password = config['password']\n self.dbname = dbname if dbname else config['dbname']\n self.driver = 'org.postgresql.Driver'\n self.jdbc = 'jdbc:postgresql://'\n\nclass Database():\n def __init__(self, args, timeout=-1):\n self.args = args\n self.conn = self.resetConn(timeout)\n\n\n # self.schema = self.compute_table_schema()\n\n def resetConn(self, timeout=-1):\n conn = psycopg2.connect(database=self.args.dbname,\n user=self.args.user,\n password=self.args.password,\n host=self.args.host,\n port=self.args.port)\n return conn\n \n def execute_sqls(self,sql):\n self.conn =self.resetConn(timeout=-1)\n cur = self.conn.cursor()\n cur.execute(sql)\n self.conn.commit()\n cur.close()\n self.conn.close()\n\n def execute_sql_duration(self, duration, sql, max_id=0, commit_interval=500):\n self.conn = self.resetConn(timeout=-1)\n cursor = self.conn.cursor()\n start = time.time()\n cnt = 0\n if duration > 0:\n while (time.time() - start) < duration:\n if max_id > 0:\n id = random.randint(1, max_id - 1)\n cursor.execute(sql + str(id) + ';')\n else:\n cursor.execute(sql)\n cnt += 1\n if cnt % commit_interval == 0:\n self.conn.commit()\n else:\n print(\"error, the duration should be larger than 0\")\n self.conn.commit()\n cursor.close()\n self.conn.close()\n return cnt\n\n def concurrent_execute_sql(self, threads, duration, sql, max_id=0, commit_interval=500):\n pool = ThreadPool(threads)\n results = [pool.apply_async(self.execute_sql_duration, (duration, sql, max_id, commit_interval)) for _ in range(threads)]\n pool.close()\n pool.join()\n return results\n \n def build_index(self, table_name, idx_num):\n self.conn = self.resetConn(timeout=-1)\n cursor = self.conn.cursor()\n \n for i in range(0, idx_num):\n the_sql = 'CREATE INDEX index_' + table_name + '_' + str(i) + ' ON ' + table_name + '(name' + str(i) + ');'\n print(the_sql)\n cursor.execute(the_sql)\n\n \n self.conn.commit()\n self.conn.close()\n return\n\n\n \n def drop_index(self,table_name):\n self.conn = self.resetConn(timeout=-1)\n cursor = self.conn.cursor()\n cursor.execute(\"select indexname from pg_indexes where tablename='\"+table_name+\"';\")\n idxs = cursor.fetchall()\n for idx in idxs:\n the_sql = 'DROP INDEX ' + idx[0] + ';'\n cursor.execute(the_sql)\n print(the_sql)\n self.conn.commit()\n self.conn.close()\n return\n\n\ndef init():\n #add the config\n config_path = \"/root/DB-GPT/config/tool_config.yaml\"\n with open(config_path, 'r') as config_file:\n config = yaml.safe_load(config_file) \n db_args 
=DBArgs('pgsql', config)\n return db_args\n\n\n#create a table\ndef create_table(table_name,colsize, ncolumns):\n db=Database(init())\n column_definitions = ', '.join(f'name{i} varchar({colsize})' for i in range(ncolumns))\n creat_sql = f'CREATE TABLE {table_name} (id int, {column_definitions}, time timestamp);'\n db.execute_sqls(creat_sql)\n\n#delete the table\ndef delete_table(table_name):\n db=Database(init())\n delete_sql=f'DROP TABLE if exists {table_name}'\n db.execute_sqls(delete_sql)\n\n#print the current time\ndef print_time():\n current_time = datetime.datetime.now()\n formatted_time = current_time.strftime(\"%Y-%m-%d %H:%M:%S\")\n print(formatted_time)\n\ndef redundent_index(threads,duration,ncolumns,nrows,colsize,nindex,table_name='table1'):\n #create a new table\n print_time()\n delete_table(table_name)\n create_table(table_name,colsize, ncolumns)\n db=Database(init())\n # insert some data to be updated \n insert_definitions = ', '.join(f'(SELECT substr(md5(random()::text), 1, {colsize}))' for i in range(ncolumns))\n insert_data=f'insert into {table_name} select generate_series(1,{nrows}),{insert_definitions}, now();' \n db.execute_sqls(insert_data) \n\n #initialization of the indexes\n nindex=int((nindex*ncolumns)/10)\n db.build_index(table_name,nindex)\n id_index='CREATE INDEX index_'+table_name+'_id ON '+table_name+'(id);'\n db.execute_sqls(id_index)\n\n #lock_contention\n pool = Pool(threads)\n for _ in range(threads):\n pool.apply_async(\n lock, (table_name, ncolumns, colsize, duration, nrows))\n pool.close()\n pool.join()\n\n #drop the index\n db.drop_index(table_name)\n\n #delete the table\n delete_table(table_name)\n print_time()\n\n\ndef lock(table_name, ncolumns, colsize, duration, nrows):\n args=init()\n start = time.time()\n #lock_contention\n while time.time()-start < duration:\n conn = psycopg2.connect(database=args.dbname, user=args.user, password=args.password,\n host=args.host, port=args.port)\n cur = conn.cursor()\n while time.time()-start < duration:\n col_name = random.randint(0, ncolumns-1)\n row_name = random.randint(1, nrows-1)\n lock_contention = f'update {table_name} set name{col_name}=(SELECT substr(md5(random()::text), 1, {colsize})) where id ={row_name}'\n #db.concurrent_execute_sql(threads,duration,lock_contention,nrows)\n cur.execute(lock_contention)\n conn.commit()\n conn.commit()\n conn.close()\n\nif __name__ == \"__main__\":\n # Number of threads to use for concurrent inserts\n num_threads = 9\n \n # Duration for which to run the inserts (in seconds)\n insert_duration = None\n \n # Number of columns in the table\n num_columns = 76\n \n # Number of rows to insert\n num_rows = 589889\n \n # Size of each column (in characters)\n column_size = 52\n \n # Table name\n table_name = 'table1'\n \n nindex=6\n \n # Call the insert_large_data function\n redundent_index(num_threads, insert_duration, num_columns, num_rows, column_size, nindex,table_name)\n", + "description": "In a database for an online marketplace, when 9 users concurrently perform a query operation on a database table with 76 columns and 589,889 rows of product records, where each column has a size of 52 characters, a large number of unnecessary indexes are created at the beginning of the query. These indexes cause additional storage usage and performance overhead.\n", + "desc": "In the business intelligence scenario, there is a database called 'BusinessIntelligenceDB' which is used for storing and analyzing data related to a company's operations. 
This database contains multiple tables, one of which is a key table called 'SalesData' that records sales transactions and relevant information. The 'SalesData' table consists of 589,889 rows of data, each representing a specific sales transaction, with a total of 76 columns, each containing information of up to 52 characters. These columns may include transaction ID, customer ID, product ID, date, quantity, price, discount, region, sales representative, and more.To improve the efficiency of analytical queries on the 'SalesData' table, such as sales trend analysis, regional performance evaluation, or customer segmentation, the database administrator decides to create a large number of indexes. These indexes are based on different combinations of columns, aiming to accelerate the retrieval and aggregation of relevant data.Suppose at a specific moment, nine analysts in the business intelligence team simultaneously execute complex queries on the 'SalesData' table. These queries involve different filtering conditions, sorting requirements, or aggregation operations, and thus require different combinations of indexes. To enhance query performance, the administrator creates multiple indexes before the queries start and then deletes them after the queries are completed.However, the frequent creation and deletion of indexes can lead to additional storage usage and performance overhead in the database. Moreover, the creation and deletion operations might cause fragmentation in the database, further impacting performance. In a business intelligence environment, this could result in increased query response time or delayed report generation, affecting the efficiency of decision-making processes.\n" + }, + "37": { + "start_time": "1697302227", + "end_time": "1697302318", + "start_timestamp": "2023-10-15 00:50:27", + "end_timestamp": "2023-10-15 00:51:58", + "alerts": [], + "labels": [ + "INSERT_LARGE_DATA", + "IO_CONTENTION" + ], + "command": "python anomaly_trigger/main.py --anomaly INSERT_LARGE_DATA,IO_CONTENTION", + "script": "import os\nimport datetime\n\n#print the current time\ndef print_time():\n current_time = datetime.datetime.now()\n formatted_time = current_time.strftime(\"%Y-%m-%d %H:%M:%S\")\n print(formatted_time)\n\nif __name__ == \"__main__\":\n print_time()\n command = (\n \"su - root -c 'cd /sysbench-tpcc-master; \"\n \"./tpcc.lua --db-driver=pgsql --tables=2 --scale=3 --threads=50 --events=0 \"\n \"--pgsql-host=xxxx --pgsql-user=xxxx --pgsql-password=xxxx \"\n \"--pgsql-port=5432 --pgsql-db=tpcc --time=90 --rand-type=uniform --report-interval=10 run'\"\n )\n\n os.system(command)\n print_time()\n", + "description": "In a file sharing system where multiple users are sharing files, if there is a simultaneous upload, download, or editing of files, it creates I/O contention. This contention leads to a slowdown in file transfer and affects the overall performance of the system.\n", + "desc": "In a file sharing system scenario, there is a database named 'TeamFileShareDB', which is used by teams or organizations to share files. This database not only stores the files themselves but also records metadata such as uploader information, file size, creation and modification dates, version history, access permissions, and download counts. On a typical workday, multiple users are simultaneously uploading, downloading, or editing files. 
For example, a project team collaborates to complete an important report, with team members frequently uploading the latest versions of files while others download them for viewing or editing. Additionally, the system might be used for storing and sharing large files, such as presentations, video conference recordings, or design drawings. Due to the high concurrency in file operations, the 'TeamFileShareDB' database faces challenges in input/output (I/O) contention. When multiple users simultaneously upload or download large files, it strains the system's storage and network bandwidth. This I/O contention can result in slower file transfer speeds, especially in situations of limited bandwidth or insufficient server processing capabilities. Furthermore, frequent write operations in the database, such as file uploads and metadata updates, can impact the database's performance. During peak periods, the database might encounter locking and transaction management issues, further slowing down file processing and metadata recording.\n" + }, + "38": { + "start_time": "1697302378", + "end_time": "1697302438", + "start_timestamp": "2023-10-15 00:52:58", + "end_timestamp": "2023-10-15 00:53:58", + "alerts": [], + "labels": [ + "POOR JOIN PERFORMANCE", + "CPU CONTENTION" + ], + "command": "python anomaly_trigger/main.py --anomaly POOR_JOIN_PERFORMANCE,CPU_CONTENTION", + "script": "import os\nimport re\nimport time\n\nimport psycopg2\n\n\nREPEATCOUNT = 1\nTIMELOGPATH = str(int(time.time())) + \"_job_trigger_time_log.txt\"\nTIMELOG = open(TIMELOGPATH, 'w+')\n\n\nclass Database():\n\n def __init__(self):\n self.conn = None\n self.conn = psycopg2.connect(database='imdbload',\n user='xxxx',\n password='xxxx',\n host='xxxx',\n port=5432)\n\n def execute_sql(self, sql):\n fail = 1\n cur = self.conn.cursor()\n i = 0\n cnt = 3\n while fail == 1 and i < cnt:\n try:\n fail = 0\n cur.execute(sql)\n except BaseException as error:\n fail = 1\n print(error)\n res = []\n if fail == 0:\n res = cur.fetchall()\n i = i + 1\n if fail == 1:\n # print(\"SQL Execution Fatal!!\", sql)\n return 0, ''\n elif fail == 0:\n return 1, res\n\n\ndef all_sql_files():\n res_path = \"{}/join-order-benchmark-master/\".format(\n os.path.dirname(os.path.abspath(__file__)))\n # all_file_list = list(filter(file_filter, os.listdir(res_path)))\n # all_file_list = sorted(all_file_list, key=custom_sort)\n all_file_list = [\n '1a.sql', '1b.sql', '1c.sql', '1d.sql',\n '2a.sql', '2b.sql', '2c.sql', '2d.sql',\n '3a.sql', '3b.sql', '3c.sql',\n '4a.sql', '4b.sql', '4c.sql',\n '5a.sql', '5b.sql', '5c.sql',\n '6a.sql', '6b.sql', '6c.sql', '6d.sql', '6e.sql', '6f.sql',\n '7a.sql', '7b.sql', '7c.sql',\n '8a.sql', '8b.sql', '8c.sql', '8d.sql',\n '9a.sql', '9b.sql', '9c.sql', '9d.sql',\n '10a.sql', '10b.sql', '10c.sql',\n '11a.sql', '11b.sql', '11c.sql', '11d.sql',\n '12a.sql', '12b.sql', '12c.sql',\n '13a.sql', '13b.sql', '13c.sql', '13d.sql',\n '14a.sql', '14b.sql', '14c.sql',\n '15a.sql', '15b.sql', '15c.sql', '15d.sql',\n '16a.sql', '16b.sql', '16c.sql', '16d.sql',\n '17a.sql', '17b.sql', '17c.sql', '17d.sql', '17e.sql', '17f.sql',\n '18a.sql', '18b.sql', '18c.sql',\n '19a.sql', '19b.sql', '19c.sql', '19d.sql',\n '20a.sql', '20b.sql', '20c.sql',\n '21a.sql', '21b.sql', '21c.sql',\n '22a.sql', '22b.sql', '22c.sql', '22d.sql',\n '23a.sql', '23b.sql', '23c.sql',\n '24a.sql', '24b.sql',\n '25a.sql', '25b.sql', '25c.sql',\n '26a.sql', '26b.sql', '26c.sql',\n '27a.sql', '27b.sql', '27c.sql',\n '28a.sql', '28b.sql', '28c.sql',\n '29a.sql', '29b.sql', 
'29c.sql',\n '30a.sql', '30b.sql', '30c.sql',\n '31a.sql', '31b.sql', '31c.sql',\n '32a.sql', '32b.sql',\n '33a.sql', '33b.sql', '33c.sql']\n\n print(all_file_list)\n files_list = []\n for file in all_file_list:\n files_list.append(res_path + file)\n return files_list\n\n\ndef custom_sort(item):\n # \u63d0\u53d6\u6570\u5b57\u548c\u5b57\u6bcd\u90e8\u5206\n match = re.match(r'(\\d+)(\\D+)', item)\n # \u5c06\u6570\u5b57\u90e8\u5206\u8f6c\u6362\u4e3a\u6574\u6570\u4ee5\u8fdb\u884c\u6bd4\u8f83\n num_part = int(match.group(1))\n # \u8fd4\u56de\u5143\u7ec4\u4ee5\u6309\u6570\u5b57\u548c\u5b57\u6bcd\u6392\u5e8f\n return (num_part, match.group(2))\n\n\ndef file_filter(f):\n if f[-4:] == '.sql' and 'schema' not in f and 'fkindexes' not in f:\n return True\n else:\n return False\n\n\ndef get_sql_from_file(file_name):\n file = open(file_name)\n lines = file.readlines().copy()\n sql = ''\n for line in lines:\n sql += line\n sql = sql.replace('\n', ' ').replace(' ', ' ').replace(' ', ' ')\n file.close()\n return sql\n\n\ndef test_hint_from_file(sql_file):\n db = Database()\n sql = get_sql_from_file(sql_file)\n success, result_cont = db.execute_sql(sql)\n print(success, result_cont)\n\n\ndef test_all():\n sql_files = all_sql_files()\n\n for sql_file in list(sql_files)[:-10]:\n if sql_file:\n test_hint_from_file(sql_file)\n\n\ndef test_one():\n res_path = \"{}/join-order-benchmark-master/\".format(\n os.path.dirname(os.path.abspath(__file__)))\n test_hint_from_file(res_path + '1a.sql')\n\n\nif __name__ == '__main__':\n for i in range(0, REPEATCOUNT):\n TIMELOG.write(str(int(time.time()))+\";\")\n test_all()\n TIMELOG.write(str(int(time.time()))+\"\n\")\n TIMELOG.flush()\n\n TIMELOG.close()\n\n)\n", + "description": "In a database system handling customer orders and product information, there is a scenario where multiple users are performing join queries between large tables. These join queries are not optimized, leading to poor performance. Additionally, there is contention among the users for CPU resources, further affecting the query performance.\n", + "desc": "In an Internet of Things (IoT) scenario, let's consider a database called 'IoTDataDB' that is used to store and analyze data collected from various IoT devices. This database contains a key table called 'DeviceReadings' that records readings from multiple sensors and devices. Each row in the table represents a specific reading and includes information such as device ID, sensor type, reading value, timestamp, location, and status. During peak hours, when numerous devices are transmitting data simultaneously, the database may encounter poor join performance. This is often caused by the lack of appropriate indexes or inefficient query optimization strategies. As a result, join operations that involve multiple tables and require combining data from different sensors or devices may become slow and resource-intensive. This can lead to delays in data analysis and processing, hindering real-time monitoring and decision-making in the IoT ecosystem. Additionally, the increased processing workload during these join operations can potentially cause CPU contention, as the database server struggles to handle multiple concurrent queries and utilize CPU resources efficiently. 
This contention can negatively impact the overall performance and responsiveness of the IoT data processing system, affecting its ability to handle high-throughput data streams and provide timely insights.\n" + }, + "39": { + "start_time": "1697302498", + "end_time": "1697302648", + "start_timestamp": "2023-10-15 00:54:58", + "end_timestamp": "2023-10-15 00:57:28", + "alerts": [], + "labels": [ + "FETCH_LARGE_DATA", + "CORRELATED SUBQUERY" + ], + "command": "python anomaly_trigger/main.py --anomaly FETCH_LARGE_DATA,CORRELATED_SUBQUERY", + "script": "import os\nimport re\nimport time\n\nimport psycopg2\n\n\nREPEATCOUNT = 1\nTIMELOGPATH = str(int(time.time())) + \"_tpch_trigger_time_log.txt\"\nTIMELOG = open(TIMELOGPATH, 'w+')\n\n\nclass Database():\n\n def __init__(self):\n self.conn = None\n self.conn = psycopg2.connect(database='tpch',\n user='xxxx',\n password='xxxx',\n host='xxxx',\n port=5432)\n\n def execute_sql(self, sql):\n fail = 1\n cur = self.conn.cursor()\n i = 0\n cnt = 3\n while fail == 1 and i < cnt:\n try:\n fail = 0\n cur.execute(sql)\n except BaseException as error:\n fail = 1\n print(error)\n res = []\n if fail == 0:\n res = cur.fetchall()\n i = i + 1\n if fail == 1:\n # print(\"SQL Execution Fatal!!\", sql)\n return 0, ''\n elif fail == 0:\n return 1, res\n\n\ndef all_sql_files():\n res_path = \"{}/tpch-queries/\".format(\n os.path.dirname(os.path.abspath(__file__)))\n # all_file_list = list(filter(file_filter, os.listdir(res_path)))\n # all_file_list = sorted(all_file_list, key=custom_sort)\n all_file_list = [\n '4.explain.sql']\n\n print(all_file_list)\n files_list = []\n for file in all_file_list:\n files_list.append(res_path + file)\n return files_list\n\n\ndef custom_sort(item):\n # \u63d0\u53d6\u6570\u5b57\u548c\u5b57\u6bcd\u90e8\u5206\n match = re.match(r'(\\d+)(\\D+)', item)\n # \u5c06\u6570\u5b57\u90e8\u5206\u8f6c\u6362\u4e3a\u6574\u6570\u4ee5\u8fdb\u884c\u6bd4\u8f83\n num_part = int(match.group(1))\n # \u8fd4\u56de\u5143\u7ec4\u4ee5\u6309\u6570\u5b57\u548c\u5b57\u6bcd\u6392\u5e8f\n return (num_part, match.group(2))\n\n\ndef file_filter(f):\n if f[-4:] == '.sql' and 'schema' not in f and 'fkindexes' not in f:\n return True\n else:\n return False\n\n\ndef get_sql_from_file(file_name):\n file = open(file_name)\n lines = file.readlines().copy()\n sql = ''\n for line in lines:\n sql += line\n sql = sql.replace('\n', ' ').replace(' ', ' ').replace(' ', ' ')\n file.close()\n return sql\n\n\ndef test_hint_from_file(sql_file):\n db = Database()\n sql = get_sql_from_file(sql_file)\n success, result_cont = db.execute_sql(sql)\n print(success, result_cont)\n\n\ndef test_all():\n sql_files = all_sql_files()\n\n for sql_file in list(sql_files):\n if sql_file:\n test_hint_from_file(sql_file)\n\n\ndef test_one():\n res_path = \"{}/tpch-queries/\".format(\n os.path.dirname(os.path.abspath(__file__)))\n test_hint_from_file(res_path + '1.explain.sql')\n\n\nif __name__ == '__main__':\n for i in range(0, REPEATCOUNT):\n TIMELOG.write(str(int(time.time()))+\";\")\n test_all()\n TIMELOG.write(str(int(time.time()))+\"\n\")\n TIMELOG.flush()\n\n TIMELOG.close()\n", + "description": "In an e-commerce platform, when trying to fetch a large amount of data, specifically the inventory of each product, the system uses correlated subqueries to find the necessary information. 
However, if these subqueries are not optimized, the performance of the inventory searching process may degrade.\n", + "desc": "In the Internet of Things (IoT) scenario, let's imagine a system that collects and analyzes sensor data from various devices. This system maintains a database called 'SensorDataDB', which stores information about sensor readings. The 'SensorReadings' table in this database contains data from multiple sensors, including sensor ID, reading type, reading value, timestamp, location, and status. At a certain moment, there is a high demand for querying specific types of sensor data, such as temperature readings from a particular location or time range. To meet these complex query requirements, the database administrator decides to execute correlated subqueries. These subqueries involve obtaining related data from different tables and using them to filter or aggregate sensor readings. However, due to the large volume of sensor data and the complexity of the correlated subqueries, the database might encounter performance issues during the query processing. This can result in slower response times or delays in providing the required sensor data, ultimately affecting the efficiency and effectiveness of the IoT system.\n" + }, + "40": { + "start_time": "1697302708", + "end_time": "1697302780", + "start_timestamp": "2023-10-15 00:58:28", + "end_timestamp": "2023-10-15 00:59:40", + "alerts": [], + "labels": [ + "highly concurrent commits or highly concurrent inserts" + ], + "command": "python anomaly_trigger/main.py --anomaly INSERT_LARGE_DATA", + "script": "import psycopg2\nimport sys\nsys.path.append('/root/DB-GPT/')\nimport time\nimport datetime\nimport random\nimport yaml\nfrom multiprocessing.pool import *\n\nclass DBArgs(object):\n\n def __init__(self, dbtype, config, dbname=None):\n self.dbtype = dbtype\n if self.dbtype == 'mysql':\n self.host = config['host']\n self.port = config['port']\n self.user = config['user']\n self.password = config['password']\n self.dbname = dbname if dbname else config['dbname']\n self.driver = 'com.mysql.jdbc.Driver'\n self.jdbc = 'jdbc:mysql://'\n else:\n self.host = config['host']\n self.port = config['port']\n self.user = config['user']\n self.password = config['password']\n self.dbname = dbname if dbname else config['dbname']\n self.driver = 'org.postgresql.Driver'\n self.jdbc = 'jdbc:postgresql://'\n\nclass Database():\n def __init__(self, args, timeout=-1):\n self.args = args\n self.conn = self.resetConn(timeout)\n\n\n # self.schema = self.compute_table_schema()\n\n def resetConn(self, timeout=-1):\n conn = psycopg2.connect(database=self.args.dbname,\n user=self.args.user,\n password=self.args.password,\n host=self.args.host,\n port=self.args.port)\n return conn\n\n def execute_sqls(self,sql):\n self.conn =self.resetConn(timeout=-1)\n cur = self.conn.cursor()\n cur.execute(sql)\n self.conn.commit()\n cur.close()\n self.conn.close()\n\n def execute_sql_duration(self, duration, sql, max_id=0, commit_interval=500):\n self.conn = self.resetConn(timeout=-1)\n cursor = self.conn.cursor()\n start = time.time()\n cnt = 0\n if duration > 0:\n while (time.time() - start) < duration:\n if max_id > 0:\n id = random.randint(1, max_id - 1)\n cursor.execute(sql + str(id) + ';')\n else:\n cursor.execute(sql)\n cnt += 1\n if cnt % commit_interval == 0:\n self.conn.commit()\n else:\n print(\"error, the duration should be larger than 0\")\n self.conn.commit()\n cursor.close()\n self.conn.close()\n return cnt\n\n def concurrent_execute_sql(self, threads, 
duration, sql, max_id=0, commit_interval=500):\n pool = ThreadPool(threads)\n results = [pool.apply_async(self.execute_sql_duration, (duration, sql, max_id, commit_interval)) for _ in range(threads)]\n pool.close()\n pool.join()\n return results\n \ndef init():\n #add the config\n config_path = \"/root/DB-GPT/config/tool_config.yaml\"\n with open(config_path, 'r') as config_file:\n config = yaml.safe_load(config_file) \n db_args =DBArgs('pgsql', config)\n return db_args\n\n#create a table\ndef create_table(table_name,colsize, ncolumns):\n db=Database(init())\n column_definitions = ', '.join(f'name{i} varchar({colsize})' for i in range(ncolumns))\n creat_sql = f'CREATE TABLE {table_name} (id int, {column_definitions}, time timestamp);'\n db.execute_sqls(creat_sql)\n\n#delete the table\ndef delete_table(table_name):\n db=Database(init())\n delete_sql=f'DROP TABLE if exists {table_name}'\n db.execute_sqls(delete_sql)\n\n#print the current time\ndef print_time():\n current_time = datetime.datetime.now()\n formatted_time = current_time.strftime(\"%Y-%m-%d %H:%M:%S\")\n print(formatted_time)\n\n\ndef insert_large_data(threads,duration,ncolumns,nrows,colsize,table_name='table1'):\n\n print_time()\n #Delete undeleted tables\n delete_table(table_name)\n #create a new table\n create_table(table_name,colsize, ncolumns)\n db=Database(init())\n #insert the data\n #insert_definitions = ', '.join(f'repeat(round(random()*999)::text,{(colsize//3)})' for i in range(ncolumns))\n insert_definitions = ', '.join(f'(SELECT substr(md5(random()::text), 1, {colsize}))' for i in range(ncolumns))\n insert_data=f'insert into {table_name} select generate_series(1,{nrows}),{insert_definitions}, now();'\n db.concurrent_execute_sql(threads,duration,insert_data,commit_interval=1)\n\n #delete the table\n delete_table(table_name)\n \n #print the end time\n print_time()\nif __name__ == \"__main__\":\n # Number of threads to use for concurrent inserts\n num_threads = 100\n \n # Duration for which to run the inserts (in seconds)\n insert_duration = None\n \n # Number of columns in the table\n num_columns = 19\n \n # Number of rows to insert\n num_rows = 65\n \n # Size of each column (in characters)\n column_size = 41\n \n # Table name\n table_name = 'table1'\n \n # Call the insert_large_data function\n insert_large_data(num_threads, insert_duration, num_columns, num_rows, column_size, table_name)\n", + "description": "In a system where data is being inserted into a database, there is a need to insert a large amount of data. This simulation involves inserting data from 100 sources at the same time. The database table being used has 19 columns and each column can hold up to 41 characters. There are a total of 65 rows in the table. This process aims to identify and trigger any anomalies or exceptions that may occur during the insertion of such large amounts of data.\n", + "desc": "In the Internet of Things (IoT) scenario, let's imagine a specialized database named 'IoTDataDB', which collects and stores data from various IoT devices. This database is designed to handle a large amount of data generated by these devices. Within this database, there is a key table named 'DeviceData', which records detailed information about each device and its corresponding data. Suppose this table contains 65 rows of data, with each row representing a specific device's data, and a total of 19 columns, each containing information up to 41 characters in length. 
These columns may include device ID, device type, location, timestamp, sensor readings, device status, temperature, humidity, pressure, and other relevant information. In order to simulate a realistic scenario, the script is using the 'INSERT_LARGE_DATA' anomaly trigger, which represents a situation where a large amount of data is simultaneously inserted into the 'DeviceData' table. In this case, 100 threads are used to execute concurrent insertion operations. This can lead to performance issues in the database, such as increased write latency and potential contention on database resources. Such anomalies can occur in real-life IoT scenarios, where a large number of devices generate and transmit data at a high frequency, and the database needs to handle concurrent data insertion operations efficiently.\n" + }, + "41": { + "start_time": "1697302840", + "end_time": "1697302911", + "start_timestamp": "2023-10-15 01:00:40", + "end_timestamp": "2023-10-15 01:01:51", + "alerts": [], + "labels": [ + "highly concurrent commits or highly concurrent inserts" + ], + "command": "python anomaly_trigger/main.py --anomaly INSERT_LARGE_DATA", + "script": "import psycopg2\nimport sys\nsys.path.append('/root/DB-GPT/')\nimport time\nimport datetime\nimport random\nimport yaml\nfrom multiprocessing.pool import *\n\nclass DBArgs(object):\n\n def __init__(self, dbtype, config, dbname=None):\n self.dbtype = dbtype\n if self.dbtype == 'mysql':\n self.host = config['host']\n self.port = config['port']\n self.user = config['user']\n self.password = config['password']\n self.dbname = dbname if dbname else config['dbname']\n self.driver = 'com.mysql.jdbc.Driver'\n self.jdbc = 'jdbc:mysql://'\n else:\n self.host = config['host']\n self.port = config['port']\n self.user = config['user']\n self.password = config['password']\n self.dbname = dbname if dbname else config['dbname']\n self.driver = 'org.postgresql.Driver'\n self.jdbc = 'jdbc:postgresql://'\n\nclass Database():\n def __init__(self, args, timeout=-1):\n self.args = args\n self.conn = self.resetConn(timeout)\n\n\n # self.schema = self.compute_table_schema()\n\n def resetConn(self, timeout=-1):\n conn = psycopg2.connect(database=self.args.dbname,\n user=self.args.user,\n password=self.args.password,\n host=self.args.host,\n port=self.args.port)\n return conn\n\n def execute_sqls(self,sql):\n self.conn =self.resetConn(timeout=-1)\n cur = self.conn.cursor()\n cur.execute(sql)\n self.conn.commit()\n cur.close()\n self.conn.close()\n\n def execute_sql_duration(self, duration, sql, max_id=0, commit_interval=500):\n self.conn = self.resetConn(timeout=-1)\n cursor = self.conn.cursor()\n start = time.time()\n cnt = 0\n if duration > 0:\n while (time.time() - start) < duration:\n if max_id > 0:\n id = random.randint(1, max_id - 1)\n cursor.execute(sql + str(id) + ';')\n else:\n cursor.execute(sql)\n cnt += 1\n if cnt % commit_interval == 0:\n self.conn.commit()\n else:\n print(\"error, the duration should be larger than 0\")\n self.conn.commit()\n cursor.close()\n self.conn.close()\n return cnt\n\n def concurrent_execute_sql(self, threads, duration, sql, max_id=0, commit_interval=500):\n pool = ThreadPool(threads)\n results = [pool.apply_async(self.execute_sql_duration, (duration, sql, max_id, commit_interval)) for _ in range(threads)]\n pool.close()\n pool.join()\n return results\n \ndef init():\n #add the config\n config_path = \"/root/DB-GPT/config/tool_config.yaml\"\n with open(config_path, 'r') as config_file:\n config = yaml.safe_load(config_file) \n db_args 
=DBArgs('pgsql', config)\n return db_args\n\n#create a table\ndef create_table(table_name,colsize, ncolumns):\n db=Database(init())\n column_definitions = ', '.join(f'name{i} varchar({colsize})' for i in range(ncolumns))\n creat_sql = f'CREATE TABLE {table_name} (id int, {column_definitions}, time timestamp);'\n db.execute_sqls(creat_sql)\n\n#delete the table\ndef delete_table(table_name):\n db=Database(init())\n delete_sql=f'DROP TABLE if exists {table_name}'\n db.execute_sqls(delete_sql)\n\n#print the current time\ndef print_time():\n current_time = datetime.datetime.now()\n formatted_time = current_time.strftime(\"%Y-%m-%d %H:%M:%S\")\n print(formatted_time)\n\n\ndef insert_large_data(threads,duration,ncolumns,nrows,colsize,table_name='table1'):\n\n print_time()\n #Delete undeleted tables\n delete_table(table_name)\n #create a new table\n create_table(table_name,colsize, ncolumns)\n db=Database(init())\n #insert the data\n #insert_definitions = ', '.join(f'repeat(round(random()*999)::text,{(colsize//3)})' for i in range(ncolumns))\n insert_definitions = ', '.join(f'(SELECT substr(md5(random()::text), 1, {colsize}))' for i in range(ncolumns))\n insert_data=f'insert into {table_name} select generate_series(1,{nrows}),{insert_definitions}, now();'\n db.concurrent_execute_sql(threads,duration,insert_data,commit_interval=1)\n\n #delete the table\n delete_table(table_name)\n \n #print the end time\n print_time()\nif __name__ == \"__main__\":\n # Number of threads to use for concurrent inserts\n num_threads = 100\n \n # Duration for which to run the inserts (in seconds)\n insert_duration = None\n \n # Number of columns in the table\n num_columns = 38\n \n # Number of rows to insert\n num_rows = 53\n \n # Size of each column (in characters)\n column_size = 84\n \n # Table name\n table_name = 'table1'\n \n # Call the insert_large_data function\n insert_large_data(num_threads, insert_duration, num_columns, num_rows, column_size, table_name)\n", + "description": "In an Internet of Things (IoT) application, 100 sensors are generating a large amount of data, which needs to be inserted into a database simultaneously. This process simulates the database exception caused by inserting data from these sensors.\n", + "desc": "In a business intelligence scenario, there is a database called 'BusinessIntelligenceDB' that is used to store and analyze data related to business operations. Within this database, there is a key table called 'BusinessData', which records various types of business data such as sales records, customer information, financial transactions, and more. This table contains 53 rows of data, each representing a specific business record, with a total of 38 columns, each capable of storing up to 84 characters. These columns may include data such as transaction ID, customer ID, transaction date, product ID, quantity, price, revenue, expenses, profit, employee ID, department, and other related attributes.Suppose at a certain point in time, there is a need to insert a large amount of data into the 'BusinessData' table. This could be due to factors such as the addition of new business records, data migration, or the import of data from external sources. The insertion operation involves inserting 100 threads of data concurrently, where each thread inserts multiple rows of data into the table.However, the large-scale insertion of data can potentially cause performance issues in the database if not properly optimized. 
Due to the high number of concurrent data insertion operations, there might be contention for database resources such as CPU, memory, and disk I/O. This contention can lead to increased latency and delays in the insertion process, ultimately affecting the overall performance of the database. Additionally, if the database schema or indexing is not appropriately designed for efficient insertion, the database might experience anomalies, such as data inconsistencies or failures in the insertion process. These anomalies could impact the accuracy and integrity of the business data stored in the database.\n" + }, + "42": { + "start_time": "1697302971", + "end_time": "1697303031", + "start_timestamp": "2023-10-15 01:02:51", + "end_timestamp": "2023-10-15 01:03:51", + "alerts": [], + "labels": [ + "highly concurrent updates" + ], + "command": "python anomaly_trigger/main.py --anomaly LOCK_CONTENTION", + "script": "import psycopg2\nimport sys\nsys.path.append('/root/DB-GPT/')\nimport time\nimport datetime\nimport random\nimport yaml\nfrom multiprocessing.pool import *\n\n\nclass DBArgs(object):\n\n def __init__(self, dbtype, config, dbname=None):\n self.dbtype = dbtype\n if self.dbtype == 'mysql':\n self.host = config['host']\n self.port = config['port']\n self.user = config['user']\n self.password = config['password']\n self.dbname = dbname if dbname else config['dbname']\n self.driver = 'com.mysql.jdbc.Driver'\n self.jdbc = 'jdbc:mysql://'\n else:\n self.host = config['host']\n self.port = config['port']\n self.user = config['user']\n self.password = config['password']\n self.dbname = dbname if dbname else config['dbname']\n self.driver = 'org.postgresql.Driver'\n self.jdbc = 'jdbc:postgresql://'\n\nclass Database():\n def __init__(self, args, timeout=-1):\n self.args = args\n self.conn = self.resetConn(timeout)\n\n\n # self.schema = self.compute_table_schema()\n\n def resetConn(self, timeout=-1):\n conn = psycopg2.connect(database=self.args.dbname,\n user=self.args.user,\n password=self.args.password,\n host=self.args.host,\n port=self.args.port)\n return conn\n \n def execute_sqls(self,sql):\n self.conn =self.resetConn(timeout=-1)\n cur = self.conn.cursor()\n cur.execute(sql)\n self.conn.commit()\n cur.close()\n self.conn.close()\n\n def execute_sql_duration(self, duration, sql, max_id=0, commit_interval=500):\n self.conn = self.resetConn(timeout=-1)\n cursor = self.conn.cursor()\n start = time.time()\n cnt = 0\n if duration > 0:\n while (time.time() - start) < duration:\n if max_id > 0:\n id = random.randint(1, max_id - 1)\n cursor.execute(sql + str(id) + ';')\n else:\n cursor.execute(sql)\n cnt += 1\n if cnt % commit_interval == 0:\n self.conn.commit()\n else:\n print(\"error, the duration should be larger than 0\")\n self.conn.commit()\n cursor.close()\n self.conn.close()\n return cnt\n\n def concurrent_execute_sql(self, threads, duration, sql, max_id=0, commit_interval=500):\n pool = ThreadPool(threads)\n results = [pool.apply_async(self.execute_sql_duration, (duration, sql, max_id, commit_interval)) for _ in range(threads)]\n pool.close()\n pool.join()\n return results\n\ndef init():\n #add the config\n config_path = \"/root/DB-GPT/config/tool_config.yaml\"\n with open(config_path, 'r') as config_file:\n config = yaml.safe_load(config_file) \n db_args =DBArgs('pgsql', config)\n return db_args\n\n\n#create a table\ndef create_table(table_name,colsize, ncolumns):\n db=Database(init())\n column_definitions = ', '.join(f'name{i} varchar({colsize})' for i in range(ncolumns))\n creat_sql = 
f'CREATE TABLE {table_name} (id int, {column_definitions}, time timestamp);'\n db.execute_sqls(creat_sql)\n\n#delete the table\ndef delete_table(table_name):\n db=Database(init())\n delete_sql=f'DROP TABLE if exists {table_name}'\n db.execute_sqls(delete_sql)\n\n#print the current time\ndef print_time():\n current_time = datetime.datetime.now()\n formatted_time = current_time.strftime(\"%Y-%m-%d %H:%M:%S\")\n print(formatted_time)\n\n\ndef lock_contention(threads,duration,ncolumns,nrows,colsize,table_name='table1'):\n #create a new table\n print_time()\n delete_table(table_name)\n create_table(table_name,colsize, ncolumns)\n db=Database(init())\n # insert some data to be updated \n insert_definitions = ', '.join(f'(SELECT substr(md5(random()::text), 1, {colsize}))' for i in range(ncolumns))\n insert_data=f'insert into {table_name} select generate_series(1,{nrows}),{insert_definitions}, now();' \n db.execute_sqls(insert_data) \n pool = Pool(threads)\n for _ in range(threads):\n pool.apply_async(\n lock, (table_name, ncolumns, colsize, duration, nrows))\n pool.close()\n pool.join()\n #delete the table\n delete_table(table_name)\n print_time()\n\ndef lock(table_name, ncolumns, colsize, duration, nrows):\n args=init()\n start = time.time()\n #lock_contention\n while time.time()-start < duration:\n conn = psycopg2.connect(database=args.dbname, user=args.user, password=args.password,\n host=args.host, port=args.port)\n cur = conn.cursor()\n while time.time()-start < duration:\n col_name = random.randint(0, ncolumns-1)\n row_name = random.randint(1, nrows-1)\n lock_contention = f'update {table_name} set name{col_name}=(SELECT substr(md5(random()::text), 1, {colsize})) where id ={row_name}'\n #db.concurrent_execute_sql(threads,duration,lock_contention,nrows)\n cur.execute(lock_contention)\n conn.commit()\n conn.commit()\n conn.close()\n\nif __name__ == \"__main__\":\n # Number of threads to use for concurrent inserts\n num_threads = 127\n \n # Duration for which to run the inserts (in seconds)\n insert_duration = None\n \n # Number of columns in the table\n num_columns = 69\n \n # Number of rows to insert\n num_rows = 351\n \n # Size of each column (in characters)\n column_size = 92\n \n # Table name\n table_name = 'table1'\n \n # Call the insert_large_data function\n lock_contention(num_threads, insert_duration, num_columns, num_rows, column_size, table_name)\n", + "description": "In a busy online marketplace, 127 users are simultaneously trying to update product records in a database table that has 69 columns and 351 rows. Each column contains data with a size of 92 characters. Due to the high number of users competing for access to the database table, there is a potential for lock contention and database exceptions may occur.\n", + "desc": "In the IoT scenario, suppose there is a database called 'SensorDataDB', used for collecting and analyzing sensor data. This database has a main table called 'SensorReadings' that stores data from various sensors. Each sensor reading includes information such as sensor ID, type, value, timestamp, location, and status. At a specific moment, 127 sensors start transmitting data simultaneously, resulting in a high volume of write operations to the 'SensorReadings' table. As multiple sensors try to update or add records to the table at the same time, there could be contention for locking the database. Due to the design and locking mechanism of the database, this can lead to performance issues, such as delayed processing or even failure of the write operations. 
In an IoT environment, prolonged locking due to such contention can impact the real-time processing of sensor data, affecting the overall efficiency of the system.\n" + }, + "43": { + "start_time": "1697303092", + "end_time": "1697303148", + "start_timestamp": "2023-10-15 01:04:52", + "end_timestamp": "2023-10-15 01:05:48", + "alerts": [], + "labels": [ + "highly deletes" + ], + "command": "python anomaly_trigger/main.py --anomaly VACUUM", + "script": "import psycopg2\nimport sys\nsys.path.append('/root/DB-GPT/')\nimport time\nimport datetime\nimport random\nimport yaml\nfrom multiprocessing.pool import *\n\n\nclass DBArgs(object):\n\n def __init__(self, dbtype, config, dbname=None):\n self.dbtype = dbtype\n if self.dbtype == 'mysql':\n self.host = config['host']\n self.port = config['port']\n self.user = config['user']\n self.password = config['password']\n self.dbname = dbname if dbname else config['dbname']\n self.driver = 'com.mysql.jdbc.Driver'\n self.jdbc = 'jdbc:mysql://'\n else:\n self.host = config['host']\n self.port = config['port']\n self.user = config['user']\n self.password = config['password']\n self.dbname = dbname if dbname else config['dbname']\n self.driver = 'org.postgresql.Driver'\n self.jdbc = 'jdbc:postgresql://'\n\nclass Database():\n def __init__(self, args, timeout=-1):\n self.args = args\n self.conn = self.resetConn(timeout)\n\n\n # self.schema = self.compute_table_schema()\n\n def resetConn(self, timeout=-1):\n conn = psycopg2.connect(database=self.args.dbname,\n user=self.args.user,\n password=self.args.password,\n host=self.args.host,\n port=self.args.port)\n return conn\n \n def execute_sqls(self,sql):\n self.conn =self.resetConn(timeout=-1)\n cur = self.conn.cursor()\n cur.execute(sql)\n self.conn.commit()\n cur.close()\n self.conn.close()\n\n def execute_sql_duration(self, duration, sql, max_id=0, commit_interval=500):\n self.conn = self.resetConn(timeout=-1)\n cursor = self.conn.cursor()\n start = time.time()\n cnt = 0\n if duration > 0:\n while (time.time() - start) < duration:\n if max_id > 0:\n id = random.randint(1, max_id - 1)\n cursor.execute(sql + str(id) + ';')\n else:\n cursor.execute(sql)\n cnt += 1\n if cnt % commit_interval == 0:\n self.conn.commit()\n else:\n print(\"error, the duration should be larger than 0\")\n self.conn.commit()\n cursor.close()\n self.conn.close()\n return cnt\n\n def concurrent_execute_sql(self, threads, duration, sql, max_id=0, commit_interval=500):\n pool = ThreadPool(threads)\n results = [pool.apply_async(self.execute_sql_duration, (duration, sql, max_id, commit_interval)) for _ in range(threads)]\n pool.close()\n pool.join()\n return results\n \ndef init():\n #add the config\n config_path = \"/root/DB-GPT/config/tool_config.yaml\"\n with open(config_path, 'r') as config_file:\n config = yaml.safe_load(config_file) \n db_args =DBArgs('pgsql', config)\n return db_args\n\n\n#create a table\ndef create_table(table_name,colsize, ncolumns):\n db=Database(init())\n column_definitions = ', '.join(f'name{i} varchar({colsize})' for i in range(ncolumns))\n creat_sql = f'CREATE TABLE {table_name} (id int, {column_definitions}, time timestamp);'\n db.execute_sqls(creat_sql)\n\n#delete the table\ndef delete_table(table_name):\n db=Database(init())\n delete_sql=f'DROP TABLE if exists {table_name}'\n db.execute_sqls(delete_sql)\n\n#print the current time\ndef print_time():\n current_time = datetime.datetime.now()\n formatted_time = current_time.strftime(\"%Y-%m-%d %H:%M:%S\")\n print(formatted_time)\n\n\ndef 
vacuum(threads,duration,ncolumns,nrows,colsize,table_name='table1'):\n db=Database(init())\n #create a new table\n print_time()\n delete_table(table_name)\n create_table(table_name,colsize, ncolumns)\n\n # insert some data to be deleted\n insert_definitions = ', '.join(f'(SELECT substr(md5(random()::text), 1, {colsize}))' for i in range(ncolumns))\n insert_data=f'insert into {table_name} select generate_series(1,{nrows}),{insert_definitions}, now();' \n db.execute_sqls(insert_data) \n\n # delete 80% of the rows\n delete_nrows=int(nrows*0.8)\n vacuum=f'delete from {table_name} where id < {delete_nrows};'\n db.execute_sqls(vacuum)\n\n # do the select , then the vacuum occurs\n select='select * from '+table_name+' where id='\n db.concurrent_execute_sql(threads,duration,select,nrows)\n\n #delete the table\n delete_table(table_name)\n print_time()\n\n\nif __name__ == \"__main__\":\n # Number of threads to use for concurrent inserts\n num_threads = 180\n \n # Duration for which to run the inserts (in seconds)\n insert_duration = None\n \n # Number of columns in the table\n num_columns = 19\n \n # Number of rows to insert\n num_rows = 2286854\n \n # Size of each column (in characters)\n column_size = 84\n \n # Table name\n table_name = 'table1'\n \n # Call the insert_large_data function\n vacuum(num_threads, insert_duration, num_columns, num_rows, column_size, table_name)\n", + "description": "In a mobile banking application, 180 users are simultaneously searching for transaction information in a database table that contains 19 columns, 2,286,854 rows, and each column with a size of 84 characters. This process triggers a simulated database exception due to the lack of optimization after performing a large-scale data cleaning operation.\n", + "desc": "In an e-commerce platform, there is a database called 'E-CommerceDB' that stores information about various products. One of the key tables in this database is 'ProductDetails', which contains detailed information about the products. This table consists of 2,286,854 rows of data, with 19 columns, each containing up to 84 characters of information. These columns include product ID, name, price, quantity, brand, category, size, color, weight, rating, reviews, production date, expiration date, supplier information, country, shipping method, discount information, image link, and description. In this scenario, the database administrator needs to perform a VACUUM operation, which is a process of reclaiming unused space and optimizing database performance. Since the database contains a large number of rows and columns, this VACUUM operation might take a considerable amount of time and resources. 
If not properly executed, it could lead to anomalies in the database, impacting its performance and affecting the overall functioning of the e-commerce platform.\n" + }, + "44": { + "start_time": "1697303208", + "end_time": "1697303323", + "start_timestamp": "2023-10-15 01:06:48", + "end_timestamp": "2023-10-15 01:08:43", + "alerts": [], + "labels": [ + "too many indexes" + ], + "command": "python anomaly_trigger/main.py --anomaly REDUNDANT_INDEX", + "script": "import psycopg2\nimport sys\nsys.path.append('/root/DB-GPT/')\nimport time\nimport datetime\nimport random\nimport yaml\nfrom multiprocessing.pool import *\n\n\nclass DBArgs(object):\n\n def __init__(self, dbtype, config, dbname=None):\n self.dbtype = dbtype\n if self.dbtype == 'mysql':\n self.host = config['host']\n self.port = config['port']\n self.user = config['user']\n self.password = config['password']\n self.dbname = dbname if dbname else config['dbname']\n self.driver = 'com.mysql.jdbc.Driver'\n self.jdbc = 'jdbc:mysql://'\n else:\n self.host = config['host']\n self.port = config['port']\n self.user = config['user']\n self.password = config['password']\n self.dbname = dbname if dbname else config['dbname']\n self.driver = 'org.postgresql.Driver'\n self.jdbc = 'jdbc:postgresql://'\n\nclass Database():\n def __init__(self, args, timeout=-1):\n self.args = args\n self.conn = self.resetConn(timeout)\n\n\n # self.schema = self.compute_table_schema()\n\n def resetConn(self, timeout=-1):\n conn = psycopg2.connect(database=self.args.dbname,\n user=self.args.user,\n password=self.args.password,\n host=self.args.host,\n port=self.args.port)\n return conn\n \n def execute_sqls(self,sql):\n self.conn =self.resetConn(timeout=-1)\n cur = self.conn.cursor()\n cur.execute(sql)\n self.conn.commit()\n cur.close()\n self.conn.close()\n\n def execute_sql_duration(self, duration, sql, max_id=0, commit_interval=500):\n self.conn = self.resetConn(timeout=-1)\n cursor = self.conn.cursor()\n start = time.time()\n cnt = 0\n if duration > 0:\n while (time.time() - start) < duration:\n if max_id > 0:\n id = random.randint(1, max_id - 1)\n cursor.execute(sql + str(id) + ';')\n else:\n cursor.execute(sql)\n cnt += 1\n if cnt % commit_interval == 0:\n self.conn.commit()\n else:\n print(\"error, the duration should be larger than 0\")\n self.conn.commit()\n cursor.close()\n self.conn.close()\n return cnt\n\n def concurrent_execute_sql(self, threads, duration, sql, max_id=0, commit_interval=500):\n pool = ThreadPool(threads)\n results = [pool.apply_async(self.execute_sql_duration, (duration, sql, max_id, commit_interval)) for _ in range(threads)]\n pool.close()\n pool.join()\n return results\n \n def build_index(self, table_name, idx_num):\n self.conn = self.resetConn(timeout=-1)\n cursor = self.conn.cursor()\n \n for i in range(0, idx_num):\n the_sql = 'CREATE INDEX index_' + table_name + '_' + str(i) + ' ON ' + table_name + '(name' + str(i) + ');'\n print(the_sql)\n cursor.execute(the_sql)\n\n \n self.conn.commit()\n self.conn.close()\n return\n\n\n \n def drop_index(self,table_name):\n self.conn = self.resetConn(timeout=-1)\n cursor = self.conn.cursor()\n cursor.execute(\"select indexname from pg_indexes where tablename='\"+table_name+\"';\")\n idxs = cursor.fetchall()\n for idx in idxs:\n the_sql = 'DROP INDEX ' + idx[0] + ';'\n cursor.execute(the_sql)\n print(the_sql)\n self.conn.commit()\n self.conn.close()\n return\n\n\ndef init():\n #add the config\n config_path = \"/root/DB-GPT/config/tool_config.yaml\"\n with open(config_path, 'r') as 
config_file:\n config = yaml.safe_load(config_file) \n db_args =DBArgs('pgsql', config)\n return db_args\n\n\n#create a table\ndef create_table(table_name,colsize, ncolumns):\n db=Database(init())\n column_definitions = ', '.join(f'name{i} varchar({colsize})' for i in range(ncolumns))\n creat_sql = f'CREATE TABLE {table_name} (id int, {column_definitions}, time timestamp);'\n db.execute_sqls(creat_sql)\n\n#delete the table\ndef delete_table(table_name):\n db=Database(init())\n delete_sql=f'DROP TABLE if exists {table_name}'\n db.execute_sqls(delete_sql)\n\n#print the current time\ndef print_time():\n current_time = datetime.datetime.now()\n formatted_time = current_time.strftime(\"%Y-%m-%d %H:%M:%S\")\n print(formatted_time)\n\ndef redundent_index(threads,duration,ncolumns,nrows,colsize,nindex,table_name='table1'):\n #create a new table\n print_time()\n delete_table(table_name)\n create_table(table_name,colsize, ncolumns)\n db=Database(init())\n # insert some data to be updated \n insert_definitions = ', '.join(f'(SELECT substr(md5(random()::text), 1, {colsize}))' for i in range(ncolumns))\n insert_data=f'insert into {table_name} select generate_series(1,{nrows}),{insert_definitions}, now();' \n db.execute_sqls(insert_data) \n\n #initialization of the indexes\n nindex=int((nindex*ncolumns)/10)\n db.build_index(table_name,nindex)\n id_index='CREATE INDEX index_'+table_name+'_id ON '+table_name+'(id);'\n db.execute_sqls(id_index)\n\n #lock_contention\n pool = Pool(threads)\n for _ in range(threads):\n pool.apply_async(\n lock, (table_name, ncolumns, colsize, duration, nrows))\n pool.close()\n pool.join()\n\n #drop the index\n db.drop_index(table_name)\n\n #delete the table\n delete_table(table_name)\n print_time()\n\n\ndef lock(table_name, ncolumns, colsize, duration, nrows):\n args=init()\n start = time.time()\n #lock_contention\n while time.time()-start < duration:\n conn = psycopg2.connect(database=args.dbname, user=args.user, password=args.password,\n host=args.host, port=args.port)\n cur = conn.cursor()\n while time.time()-start < duration:\n col_name = random.randint(0, ncolumns-1)\n row_name = random.randint(1, nrows-1)\n lock_contention = f'update {table_name} set name{col_name}=(SELECT substr(md5(random()::text), 1, {colsize})) where id ={row_name}'\n #db.concurrent_execute_sql(threads,duration,lock_contention,nrows)\n cur.execute(lock_contention)\n conn.commit()\n conn.commit()\n conn.close()\n\nif __name__ == \"__main__\":\n # Number of threads to use for concurrent inserts\n num_threads = 9\n \n # Duration for which to run the inserts (in seconds)\n insert_duration = None\n \n # Number of columns in the table\n num_columns = 89\n \n # Number of rows to insert\n num_rows = 635369\n \n # Size of each column (in characters)\n column_size = 92\n \n # Table name\n table_name = 'table1'\n \n nindex=6\n \n # Call the insert_large_data function\n redundent_index(num_threads, insert_duration, num_columns, num_rows, column_size, nindex,table_name)\n", + "description": "In an online store with a large database containing 89 columns and 635,369 rows, each column having a size of 92 characters, the script simulates the creation of redundant indexes on various attributes such as product name, category, and price range. After the indexes are created, 9 users perform simultaneous queries on the database. 
The script measures the additional storage requirements and performance overhead caused by the redundant indexes.\n", + "desc": "In the business intelligence scenario, let's imagine a database called 'BusinessAnalyticsDB', which is used for analyzing and generating reports on various business data. This database consists of multiple tables, one of which is a vital table named 'SalesData'. The 'SalesData' table contains detailed information about sales transactions, such as sales ID, customer ID, product ID, quantity, price, transaction date, sales region, and other related fields. This table holds a large volume of data, with 635,369 rows, and each row represents a sales transaction record. Moreover, there are 89 columns in the table, with each column accommodating information of up to 92 characters. These columns may include information on product category, brand, salesperson, payment methods, promotional codes, order status, and more.\n" + }, + "45": { + "start_time": "1697303383", + "end_time": "1697303474", + "start_timestamp": "2023-10-15 01:09:43", + "end_timestamp": "2023-10-15 01:11:14", + "alerts": [], + "labels": [ + "INSERT_LARGE_DATA", + "IO_CONTENTION" + ], + "command": "python anomaly_trigger/main.py --anomaly INSERT_LARGE_DATA,IO_CONTENTION", + "script": "import os\nimport datetime\n\n#print the current time\ndef print_time():\n current_time = datetime.datetime.now()\n formatted_time = current_time.strftime(\"%Y-%m-%d %H:%M:%S\")\n print(formatted_time)\n\nif __name__ == \"__main__\":\n print_time()\n command = (\n \"su - root -c 'cd /sysbench-tpcc-master; \"\n \"./tpcc.lua --db-driver=pgsql --tables=2 --scale=3 --threads=50 --events=0 \"\n \"--pgsql-host=xxxx --pgsql-user=xxxx --pgsql-password=xxxx \"\n \"--pgsql-port=5432 --pgsql-db=tpcc --time=90 --rand-type=uniform --report-interval=10 run'\"\n )\n\n os.system(command)\n print_time()\n", + "description": "In a file sharing system where multiple users are simultaneously uploading, downloading, or editing files, there is intense competition for input/output (I/O) resources. This leads to slowed down file transfers and overall system performance.\n", + "desc": "In the context of this business intelligence database, the performance of analytical queries and report generation tasks is crucial. In order to speed up the processing of such queries, the database administrator may consider creating additional indexes on certain columns or combinations of columns. However, if indexes are created excessively or without careful consideration, redundant index creation might occur. This means that multiple indexes could be created on the same or similar columns within the 'SalesData' table. These redundant indexes can impact database performance in several ways. Firstly, they increase storage usage, as each index occupies additional disk space. Secondly, redundant indexes require additional maintenance overhead, such as update operations and index fragmentation management, which can slow down overall database performance. Lastly, excessive indexes might lead to query plan confusion, where the database optimizer may struggle to select the most efficient query execution plan due to the presence of multiple similar indexes. 
Consequently, these redundant indexes could contribute to performance degradation, longer query execution times, and inefficient resource utilization in the 'BusinessAnalyticsDB'.\n" + }, + "46": { + "start_time": "1697303534", + "end_time": "1697303594", + "start_timestamp": "2023-10-15 01:12:14", + "end_timestamp": "2023-10-15 01:13:14", + "alerts": [], + "labels": [ + "POOR JOIN PERFORMANCE", + "CPU CONTENTION" + ], + "command": "python anomaly_trigger/main.py --anomaly POOR_JOIN_PERFORMANCE,CPU_CONTENTION", + "script": "import os\nimport re\nimport time\n\nimport psycopg2\n\n\nREPEATCOUNT = 1\nTIMELOGPATH = str(int(time.time())) + \"_job_trigger_time_log.txt\"\nTIMELOG = open(TIMELOGPATH, 'w+')\n\n\nclass Database():\n\n def __init__(self):\n self.conn = None\n self.conn = psycopg2.connect(database='imdbload',\n user='xxxx',\n password='xxxx',\n host='xxxx',\n port=5432)\n\n def execute_sql(self, sql):\n fail = 1\n cur = self.conn.cursor()\n i = 0\n cnt = 3\n while fail == 1 and i < cnt:\n try:\n fail = 0\n cur.execute(sql)\n except BaseException as error:\n fail = 1\n print(error)\n res = []\n if fail == 0:\n res = cur.fetchall()\n i = i + 1\n if fail == 1:\n # print(\"SQL Execution Fatal!!\", sql)\n return 0, ''\n elif fail == 0:\n return 1, res\n\n\ndef all_sql_files():\n res_path = \"{}/join-order-benchmark-master/\".format(\n os.path.dirname(os.path.abspath(__file__)))\n # all_file_list = list(filter(file_filter, os.listdir(res_path)))\n # all_file_list = sorted(all_file_list, key=custom_sort)\n all_file_list = [\n '1a.sql', '1b.sql', '1c.sql', '1d.sql',\n '2a.sql', '2b.sql', '2c.sql', '2d.sql',\n '3a.sql', '3b.sql', '3c.sql',\n '4a.sql', '4b.sql', '4c.sql',\n '5a.sql', '5b.sql', '5c.sql',\n '6a.sql', '6b.sql', '6c.sql', '6d.sql', '6e.sql', '6f.sql',\n '7a.sql', '7b.sql', '7c.sql',\n '8a.sql', '8b.sql', '8c.sql', '8d.sql',\n '9a.sql', '9b.sql', '9c.sql', '9d.sql',\n '10a.sql', '10b.sql', '10c.sql',\n '11a.sql', '11b.sql', '11c.sql', '11d.sql',\n '12a.sql', '12b.sql', '12c.sql',\n '13a.sql', '13b.sql', '13c.sql', '13d.sql',\n '14a.sql', '14b.sql', '14c.sql',\n '15a.sql', '15b.sql', '15c.sql', '15d.sql',\n '16a.sql', '16b.sql', '16c.sql', '16d.sql',\n '17a.sql', '17b.sql', '17c.sql', '17d.sql', '17e.sql', '17f.sql',\n '18a.sql', '18b.sql', '18c.sql',\n '19a.sql', '19b.sql', '19c.sql', '19d.sql',\n '20a.sql', '20b.sql', '20c.sql',\n '21a.sql', '21b.sql', '21c.sql',\n '22a.sql', '22b.sql', '22c.sql', '22d.sql',\n '23a.sql', '23b.sql', '23c.sql',\n '24a.sql', '24b.sql',\n '25a.sql', '25b.sql', '25c.sql',\n '26a.sql', '26b.sql', '26c.sql',\n '27a.sql', '27b.sql', '27c.sql',\n '28a.sql', '28b.sql', '28c.sql',\n '29a.sql', '29b.sql', '29c.sql',\n '30a.sql', '30b.sql', '30c.sql',\n '31a.sql', '31b.sql', '31c.sql',\n '32a.sql', '32b.sql',\n '33a.sql', '33b.sql', '33c.sql']\n\n print(all_file_list)\n files_list = []\n for file in all_file_list:\n files_list.append(res_path + file)\n return files_list\n\n\ndef custom_sort(item):\n # \u63d0\u53d6\u6570\u5b57\u548c\u5b57\u6bcd\u90e8\u5206\n match = re.match(r'(\\d+)(\\D+)', item)\n # \u5c06\u6570\u5b57\u90e8\u5206\u8f6c\u6362\u4e3a\u6574\u6570\u4ee5\u8fdb\u884c\u6bd4\u8f83\n num_part = int(match.group(1))\n # \u8fd4\u56de\u5143\u7ec4\u4ee5\u6309\u6570\u5b57\u548c\u5b57\u6bcd\u6392\u5e8f\n return (num_part, match.group(2))\n\n\ndef file_filter(f):\n if f[-4:] == '.sql' and 'schema' not in f and 'fkindexes' not in f:\n return True\n else:\n return False\n\n\ndef get_sql_from_file(file_name):\n file = open(file_name)\n lines = 
file.readlines().copy()\n sql = ''\n for line in lines:\n sql += line\n sql = sql.replace('\n', ' ').replace(' ', ' ').replace(' ', ' ')\n file.close()\n return sql\n\n\ndef test_hint_from_file(sql_file):\n db = Database()\n sql = get_sql_from_file(sql_file)\n success, result_cont = db.execute_sql(sql)\n print(success, result_cont)\n\n\ndef test_all():\n sql_files = all_sql_files()\n\n for sql_file in list(sql_files)[:-10]:\n if sql_file:\n test_hint_from_file(sql_file)\n\n\ndef test_one():\n res_path = \"{}/join-order-benchmark-master/\".format(\n os.path.dirname(os.path.abspath(__file__)))\n test_hint_from_file(res_path + '1a.sql')\n\n\nif __name__ == '__main__':\n for i in range(0, REPEATCOUNT):\n TIMELOG.write(str(int(time.time()))+\";\")\n test_all()\n TIMELOG.write(str(int(time.time()))+\"\n\")\n TIMELOG.flush()\n\n TIMELOG.close()\n\n)\n", + "description": "In a data analytics system, multiple users are performing join operations on large datasets. The join performance is poor due to suboptimal join algorithms or inefficient use of indexes. Additionally, there is CPU contention as multiple users are simultaneously executing resource-intensive queries, which leads to slower query execution times.\n", + "desc": "In an Internet of Things (IoT) scenario, there is a database used for collecting and analyzing sensor data. This database is designed to handle a large volume of data from various types of sensors. One specific table in the database is called 'SensorReadings' and it contains information such as sensor ID, reading type, reading value, timestamp, sensor location, and status. When multiple sensors transmit data simultaneously at a high frequency, the database might experience performance issues, especially if there is a lack of effective data partitioning, insufficient buffering mechanisms, or improper indexing. This can lead to increased write latency, database locking, and anomalies in the database.In an IoT scenario, there is a database specifically designed for storing sensor data from various IoT devices, such as temperature sensors, motion detectors, and humidity sensors. This database, named 'IoTDataDB', contains a primary table called 'SensorData', which records data from these sensors. Each row in the table represents a data point from a specific sensor, including the sensor ID, timestamp, sensor type, and sensor value. In this scenario, due to the high volume of incoming sensor data and the need for real-time analysis, the database administrator wants to perform join operations on the 'SensorData' table to combine data from multiple sensors and generate meaningful insights. However, due to poor join performance, these join operations are not efficient and take a significant amount of time to execute.This poor join performance can be attributed to various factors, such as inefficient join algorithms, lack of appropriate indexes on the join columns, or insufficient memory resources available for performing the join. As a result, the database struggles to process these join operations in a timely manner, leading to delays in data analysis and a potential bottleneck in the overall system performance.Furthermore, in this IoT environment, where data is continuously streaming in from multiple sensors, the database server might also face CPU contention. 
Multiple processes or threads might be competing for the limited CPU resources, resulting in high CPU usage and potential bottlenecks.These performance issues not only affect the real-time analysis of sensor data but can also impact the overall performance and scalability of the IoT system. Efficient join operations and proper resource allocation are crucial for ensuring timely data processing and analysis in an IoT environment.\n" + }, + "47": { + "start_time": "1697303655", + "end_time": "1697303804", + "start_timestamp": "2023-10-15 01:14:15", + "end_timestamp": "2023-10-15 01:16:44", + "alerts": [], + "labels": [ + "FETCH_LARGE_DATA", + "CORRELATED SUBQUERY" + ], + "command": "python anomaly_trigger/main.py --anomaly FETCH_LARGE_DATA,CORRELATED_SUBQUERY", + "script": "import os\nimport re\nimport time\n\nimport psycopg2\n\n\nREPEATCOUNT = 1\nTIMELOGPATH = str(int(time.time())) + \"_tpch_trigger_time_log.txt\"\nTIMELOG = open(TIMELOGPATH, 'w+')\n\n\nclass Database():\n\n def __init__(self):\n self.conn = None\n self.conn = psycopg2.connect(database='tpch',\n user='xxxx',\n password='xxxx',\n host='xxxx',\n port=5432)\n\n def execute_sql(self, sql):\n fail = 1\n cur = self.conn.cursor()\n i = 0\n cnt = 3\n while fail == 1 and i < cnt:\n try:\n fail = 0\n cur.execute(sql)\n except BaseException as error:\n fail = 1\n print(error)\n res = []\n if fail == 0:\n res = cur.fetchall()\n i = i + 1\n if fail == 1:\n # print(\"SQL Execution Fatal!!\", sql)\n return 0, ''\n elif fail == 0:\n return 1, res\n\n\ndef all_sql_files():\n res_path = \"{}/tpch-queries/\".format(\n os.path.dirname(os.path.abspath(__file__)))\n # all_file_list = list(filter(file_filter, os.listdir(res_path)))\n # all_file_list = sorted(all_file_list, key=custom_sort)\n all_file_list = [\n '4.explain.sql']\n\n print(all_file_list)\n files_list = []\n for file in all_file_list:\n files_list.append(res_path + file)\n return files_list\n\n\ndef custom_sort(item):\n # \u63d0\u53d6\u6570\u5b57\u548c\u5b57\u6bcd\u90e8\u5206\n match = re.match(r'(\\d+)(\\D+)', item)\n # \u5c06\u6570\u5b57\u90e8\u5206\u8f6c\u6362\u4e3a\u6574\u6570\u4ee5\u8fdb\u884c\u6bd4\u8f83\n num_part = int(match.group(1))\n # \u8fd4\u56de\u5143\u7ec4\u4ee5\u6309\u6570\u5b57\u548c\u5b57\u6bcd\u6392\u5e8f\n return (num_part, match.group(2))\n\n\ndef file_filter(f):\n if f[-4:] == '.sql' and 'schema' not in f and 'fkindexes' not in f:\n return True\n else:\n return False\n\n\ndef get_sql_from_file(file_name):\n file = open(file_name)\n lines = file.readlines().copy()\n sql = ''\n for line in lines:\n sql += line\n sql = sql.replace('\n', ' ').replace(' ', ' ').replace(' ', ' ')\n file.close()\n return sql\n\n\ndef test_hint_from_file(sql_file):\n db = Database()\n sql = get_sql_from_file(sql_file)\n success, result_cont = db.execute_sql(sql)\n print(success, result_cont)\n\n\ndef test_all():\n sql_files = all_sql_files()\n\n for sql_file in list(sql_files):\n if sql_file:\n test_hint_from_file(sql_file)\n\n\ndef test_one():\n res_path = \"{}/tpch-queries/\".format(\n os.path.dirname(os.path.abspath(__file__)))\n test_hint_from_file(res_path + '1.explain.sql')\n\n\nif __name__ == '__main__':\n for i in range(0, REPEATCOUNT):\n TIMELOG.write(str(int(time.time()))+\";\")\n test_all()\n TIMELOG.write(str(int(time.time()))+\"\n\")\n TIMELOG.flush()\n\n TIMELOG.close()\n", + "description": "In an online shopping platform's database, when trying to retrieve a large amount of data and perform related subqueries to determine the inventory for each product, the execution of 
these subqueries may not be optimized, leading to slower performance in querying the inventory.\n", + "desc": "In an Internet of Things (IoT) scenario, there is a database used for collecting and analyzing sensor data. This database contains a table called 'SensorDataDB', which stores information from various sensors. Each row in the table represents a sensor reading and includes fields such as sensor ID, reading type, value, timestamp, location, and status. When conducting queries on this table, there can be instances where a large amount of data needs to be fetched, such as retrieving all sensor readings within a specific time range or for a certain location. In these cases, the queries might involve correlated subqueries to retrieve additional information related to the sensor readings. This could result in slower query performance and potentially consume a significant amount of disk I/O resources, thereby affecting the overall efficiency of the database and the data analysis process.\n" + }, + "48": { + "start_time": "1697303864", + "end_time": "1697303935", + "start_timestamp": "2023-10-15 01:17:44", + "end_timestamp": "2023-10-15 01:18:55", + "alerts": [], + "labels": [ + "highly concurrent commits or highly concurrent inserts" + ], + "command": "python anomaly_trigger/main.py --anomaly INSERT_LARGE_DATA", + "script": "import psycopg2\nimport sys\nsys.path.append('/root/DB-GPT/')\nimport time\nimport datetime\nimport random\nimport yaml\nfrom multiprocessing.pool import *\n\nclass DBArgs(object):\n\n def __init__(self, dbtype, config, dbname=None):\n self.dbtype = dbtype\n if self.dbtype == 'mysql':\n self.host = config['host']\n self.port = config['port']\n self.user = config['user']\n self.password = config['password']\n self.dbname = dbname if dbname else config['dbname']\n self.driver = 'com.mysql.jdbc.Driver'\n self.jdbc = 'jdbc:mysql://'\n else:\n self.host = config['host']\n self.port = config['port']\n self.user = config['user']\n self.password = config['password']\n self.dbname = dbname if dbname else config['dbname']\n self.driver = 'org.postgresql.Driver'\n self.jdbc = 'jdbc:postgresql://'\n\nclass Database():\n def __init__(self, args, timeout=-1):\n self.args = args\n self.conn = self.resetConn(timeout)\n\n\n # self.schema = self.compute_table_schema()\n\n def resetConn(self, timeout=-1):\n conn = psycopg2.connect(database=self.args.dbname,\n user=self.args.user,\n password=self.args.password,\n host=self.args.host,\n port=self.args.port)\n return conn\n\n def execute_sqls(self,sql):\n self.conn =self.resetConn(timeout=-1)\n cur = self.conn.cursor()\n cur.execute(sql)\n self.conn.commit()\n cur.close()\n self.conn.close()\n\n def execute_sql_duration(self, duration, sql, max_id=0, commit_interval=500):\n self.conn = self.resetConn(timeout=-1)\n cursor = self.conn.cursor()\n start = time.time()\n cnt = 0\n if duration > 0:\n while (time.time() - start) < duration:\n if max_id > 0:\n id = random.randint(1, max_id - 1)\n cursor.execute(sql + str(id) + ';')\n else:\n cursor.execute(sql)\n cnt += 1\n if cnt % commit_interval == 0:\n self.conn.commit()\n else:\n print(\"error, the duration should be larger than 0\")\n self.conn.commit()\n cursor.close()\n self.conn.close()\n return cnt\n\n def concurrent_execute_sql(self, threads, duration, sql, max_id=0, commit_interval=500):\n pool = ThreadPool(threads)\n results = [pool.apply_async(self.execute_sql_duration, (duration, sql, max_id, commit_interval)) for _ in range(threads)]\n pool.close()\n pool.join()\n return results\n 
\ndef init():\n #add the config\n config_path = \"/root/DB-GPT/config/tool_config.yaml\"\n with open(config_path, 'r') as config_file:\n config = yaml.safe_load(config_file) \n db_args =DBArgs('pgsql', config)\n return db_args\n\n#create a table\ndef create_table(table_name,colsize, ncolumns):\n db=Database(init())\n column_definitions = ', '.join(f'name{i} varchar({colsize})' for i in range(ncolumns))\n creat_sql = f'CREATE TABLE {table_name} (id int, {column_definitions}, time timestamp);'\n db.execute_sqls(creat_sql)\n\n#delete the table\ndef delete_table(table_name):\n db=Database(init())\n delete_sql=f'DROP TABLE if exists {table_name}'\n db.execute_sqls(delete_sql)\n\n#print the current time\ndef print_time():\n current_time = datetime.datetime.now()\n formatted_time = current_time.strftime(\"%Y-%m-%d %H:%M:%S\")\n print(formatted_time)\n\n\ndef insert_large_data(threads,duration,ncolumns,nrows,colsize,table_name='table1'):\n\n print_time()\n #Delete undeleted tables\n delete_table(table_name)\n #create a new table\n create_table(table_name,colsize, ncolumns)\n db=Database(init())\n #insert the data\n #insert_definitions = ', '.join(f'repeat(round(random()*999)::text,{(colsize//3)})' for i in range(ncolumns))\n insert_definitions = ', '.join(f'(SELECT substr(md5(random()::text), 1, {colsize}))' for i in range(ncolumns))\n insert_data=f'insert into {table_name} select generate_series(1,{nrows}),{insert_definitions}, now();'\n db.concurrent_execute_sql(threads,duration,insert_data,commit_interval=1)\n\n #delete the table\n delete_table(table_name)\n \n #print the end time\n print_time()\nif __name__ == \"__main__\":\n # Number of threads to use for concurrent inserts\n num_threads = 98\n \n # Duration for which to run the inserts (in seconds)\n insert_duration = None\n \n # Number of columns in the table\n num_columns = 16\n \n # Number of rows to insert\n num_rows = 66\n \n # Size of each column (in characters)\n column_size = 48\n \n # Table name\n table_name = 'table1'\n \n # Call the insert_large_data function\n insert_large_data(num_threads, insert_duration, num_columns, num_rows, column_size, table_name)\n", + "description": "In a real-life scenario, 98 sensors are generating a large amount of data that needs to be inserted into a database simultaneously. The database table has 16 columns, each with a size of 48 characters, and there are 66 rows of data to be inserted. By simulating this process, we can observe and analyze any exceptions or issues that might arise due to this high-volume data insertion.\n", + "desc": "In an Internet of Things (IoT) scenario, there is a database specifically used for collecting and analyzing sensor data, named 'SensorDataDB'. This database is designed to handle a large volume of data from various types of sensors. The primary table in the database is called 'SensorReadings', which contains various fields to store data from sensors. These fields may include sensor ID, reading type, reading value, timestamp, sensor location, and status information.In this scenario, there is a need to insert a large amount of data into the 'SensorReadings' table. The script specifies that 66 rows of data should be inserted, with each row containing 16 columns, each column having a size of 48 characters. The script also indicates that the insertion should be performed using 98 threads, which means that the data insertion operation will be concurrent.During the insertion process, each thread will be responsible for inserting a portion of the data into the table. 
However, depending on the design and configuration of the database, this large-scale data insertion operation could impact the performance of the database. Without proper optimization measures, such as buffering mechanisms, efficient transaction management, or parallel processing techniques, this data insertion operation might lead to anomalies in the database, such as increased write latency, database locking, or even failure to insert all the data properly.It is important to carefully consider the database's architecture and performance characteristics, as well as devise an optimal data insertion strategy, to ensure the smooth execution of these large-scale data insertion operations.\n" + }, + "49": { + "start_time": "1697303995", + "end_time": "1697304067", + "start_timestamp": "2023-10-15 01:19:55", + "end_timestamp": "2023-10-15 01:21:07", + "alerts": [], + "labels": [ + "highly concurrent commits or highly concurrent inserts" + ], + "command": "python anomaly_trigger/main.py --anomaly INSERT_LARGE_DATA", + "script": "import psycopg2\nimport sys\nsys.path.append('/root/DB-GPT/')\nimport time\nimport datetime\nimport random\nimport yaml\nfrom multiprocessing.pool import *\n\nclass DBArgs(object):\n\n def __init__(self, dbtype, config, dbname=None):\n self.dbtype = dbtype\n if self.dbtype == 'mysql':\n self.host = config['host']\n self.port = config['port']\n self.user = config['user']\n self.password = config['password']\n self.dbname = dbname if dbname else config['dbname']\n self.driver = 'com.mysql.jdbc.Driver'\n self.jdbc = 'jdbc:mysql://'\n else:\n self.host = config['host']\n self.port = config['port']\n self.user = config['user']\n self.password = config['password']\n self.dbname = dbname if dbname else config['dbname']\n self.driver = 'org.postgresql.Driver'\n self.jdbc = 'jdbc:postgresql://'\n\nclass Database():\n def __init__(self, args, timeout=-1):\n self.args = args\n self.conn = self.resetConn(timeout)\n\n\n # self.schema = self.compute_table_schema()\n\n def resetConn(self, timeout=-1):\n conn = psycopg2.connect(database=self.args.dbname,\n user=self.args.user,\n password=self.args.password,\n host=self.args.host,\n port=self.args.port)\n return conn\n\n def execute_sqls(self,sql):\n self.conn =self.resetConn(timeout=-1)\n cur = self.conn.cursor()\n cur.execute(sql)\n self.conn.commit()\n cur.close()\n self.conn.close()\n\n def execute_sql_duration(self, duration, sql, max_id=0, commit_interval=500):\n self.conn = self.resetConn(timeout=-1)\n cursor = self.conn.cursor()\n start = time.time()\n cnt = 0\n if duration > 0:\n while (time.time() - start) < duration:\n if max_id > 0:\n id = random.randint(1, max_id - 1)\n cursor.execute(sql + str(id) + ';')\n else:\n cursor.execute(sql)\n cnt += 1\n if cnt % commit_interval == 0:\n self.conn.commit()\n else:\n print(\"error, the duration should be larger than 0\")\n self.conn.commit()\n cursor.close()\n self.conn.close()\n return cnt\n\n def concurrent_execute_sql(self, threads, duration, sql, max_id=0, commit_interval=500):\n pool = ThreadPool(threads)\n results = [pool.apply_async(self.execute_sql_duration, (duration, sql, max_id, commit_interval)) for _ in range(threads)]\n pool.close()\n pool.join()\n return results\n \ndef init():\n #add the config\n config_path = \"/root/DB-GPT/config/tool_config.yaml\"\n with open(config_path, 'r') as config_file:\n config = yaml.safe_load(config_file) \n db_args =DBArgs('pgsql', config)\n return db_args\n\n#create a table\ndef create_table(table_name,colsize, ncolumns):\n 
db=Database(init())\n column_definitions = ', '.join(f'name{i} varchar({colsize})' for i in range(ncolumns))\n creat_sql = f'CREATE TABLE {table_name} (id int, {column_definitions}, time timestamp);'\n db.execute_sqls(creat_sql)\n\n#delete the table\ndef delete_table(table_name):\n db=Database(init())\n delete_sql=f'DROP TABLE if exists {table_name}'\n db.execute_sqls(delete_sql)\n\n#print the current time\ndef print_time():\n current_time = datetime.datetime.now()\n formatted_time = current_time.strftime(\"%Y-%m-%d %H:%M:%S\")\n print(formatted_time)\n\n\ndef insert_large_data(threads,duration,ncolumns,nrows,colsize,table_name='table1'):\n\n print_time()\n #Delete undeleted tables\n delete_table(table_name)\n #create a new table\n create_table(table_name,colsize, ncolumns)\n db=Database(init())\n #insert the data\n #insert_definitions = ', '.join(f'repeat(round(random()*999)::text,{(colsize//3)})' for i in range(ncolumns))\n insert_definitions = ', '.join(f'(SELECT substr(md5(random()::text), 1, {colsize}))' for i in range(ncolumns))\n insert_data=f'insert into {table_name} select generate_series(1,{nrows}),{insert_definitions}, now();'\n db.concurrent_execute_sql(threads,duration,insert_data,commit_interval=1)\n\n #delete the table\n delete_table(table_name)\n \n #print the end time\n print_time()\nif __name__ == \"__main__\":\n # Number of threads to use for concurrent inserts\n num_threads = 98\n \n # Duration for which to run the inserts (in seconds)\n insert_duration = None\n \n # Number of columns in the table\n num_columns = 20\n \n # Number of rows to insert\n num_rows = 100\n \n # Size of each column (in characters)\n column_size = 75\n \n # Table name\n table_name = 'table1'\n \n # Call the insert_large_data function\n insert_large_data(num_threads, insert_duration, num_columns, num_rows, column_size, table_name)\n", + "description": "In a data analysis project, 98 threads are used to simultaneously insert a large amount of data into a database table with 20 columns and 100 rows, each column having a size of 75 characters. This can cause a database exception due to the high volume of data being inserted at the same time.\n", + "desc": "In an Internet of Things (IoT) environment, there is a database designed specifically for collecting and analyzing sensor data. This database, called 'SensorDataDB', stores information from various sensors and is used to track and monitor different environmental parameters. One of the key tables in this database is 'SensorReadings', which contains data from 100 sensors. This table includes fields such as sensor ID, reading type (e.g., temperature, humidity, pressure), reading value, timestamp, sensor location, and status information.In a particular scenario, a large influx of sensor data is generated by these 100 sensors, with each sensor transmitting data at a high frequency. However, due to the lack of optimization techniques such as efficient data partitioning, proper indexing, or buffering mechanisms, the database struggles to handle these concurrent write requests effectively. As a result, the database experiences performance issues, including increased write latency and potential database locking.This anomaly can lead to delayed or failed write operations, affecting the real-time nature of the sensor data being captured. 
Additionally, it may impact the overall efficiency of the IoT system, making it more challenging to monitor and analyze sensor data accurately.\n" + }, + "50": { + "start_time": "1697304127", + "end_time": "1697304187", + "start_timestamp": "2023-10-15 01:22:07", + "end_timestamp": "2023-10-15 01:23:07", + "alerts": [], + "labels": [ + "highly concurrent updates" + ], + "command": "python anomaly_trigger/main.py --anomaly LOCK_CONTENTION", + "script": "import psycopg2\nimport sys\nsys.path.append('/root/DB-GPT/')\nimport time\nimport datetime\nimport random\nimport yaml\nfrom multiprocessing.pool import *\n\n\nclass DBArgs(object):\n\n def __init__(self, dbtype, config, dbname=None):\n self.dbtype = dbtype\n if self.dbtype == 'mysql':\n self.host = config['host']\n self.port = config['port']\n self.user = config['user']\n self.password = config['password']\n self.dbname = dbname if dbname else config['dbname']\n self.driver = 'com.mysql.jdbc.Driver'\n self.jdbc = 'jdbc:mysql://'\n else:\n self.host = config['host']\n self.port = config['port']\n self.user = config['user']\n self.password = config['password']\n self.dbname = dbname if dbname else config['dbname']\n self.driver = 'org.postgresql.Driver'\n self.jdbc = 'jdbc:postgresql://'\n\nclass Database():\n def __init__(self, args, timeout=-1):\n self.args = args\n self.conn = self.resetConn(timeout)\n\n\n # self.schema = self.compute_table_schema()\n\n def resetConn(self, timeout=-1):\n conn = psycopg2.connect(database=self.args.dbname,\n user=self.args.user,\n password=self.args.password,\n host=self.args.host,\n port=self.args.port)\n return conn\n \n def execute_sqls(self,sql):\n self.conn =self.resetConn(timeout=-1)\n cur = self.conn.cursor()\n cur.execute(sql)\n self.conn.commit()\n cur.close()\n self.conn.close()\n\n def execute_sql_duration(self, duration, sql, max_id=0, commit_interval=500):\n self.conn = self.resetConn(timeout=-1)\n cursor = self.conn.cursor()\n start = time.time()\n cnt = 0\n if duration > 0:\n while (time.time() - start) < duration:\n if max_id > 0:\n id = random.randint(1, max_id - 1)\n cursor.execute(sql + str(id) + ';')\n else:\n cursor.execute(sql)\n cnt += 1\n if cnt % commit_interval == 0:\n self.conn.commit()\n else:\n print(\"error, the duration should be larger than 0\")\n self.conn.commit()\n cursor.close()\n self.conn.close()\n return cnt\n\n def concurrent_execute_sql(self, threads, duration, sql, max_id=0, commit_interval=500):\n pool = ThreadPool(threads)\n results = [pool.apply_async(self.execute_sql_duration, (duration, sql, max_id, commit_interval)) for _ in range(threads)]\n pool.close()\n pool.join()\n return results\n\ndef init():\n #add the config\n config_path = \"/root/DB-GPT/config/tool_config.yaml\"\n with open(config_path, 'r') as config_file:\n config = yaml.safe_load(config_file) \n db_args =DBArgs('pgsql', config)\n return db_args\n\n\n#create a table\ndef create_table(table_name,colsize, ncolumns):\n db=Database(init())\n column_definitions = ', '.join(f'name{i} varchar({colsize})' for i in range(ncolumns))\n creat_sql = f'CREATE TABLE {table_name} (id int, {column_definitions}, time timestamp);'\n db.execute_sqls(creat_sql)\n\n#delete the table\ndef delete_table(table_name):\n db=Database(init())\n delete_sql=f'DROP TABLE if exists {table_name}'\n db.execute_sqls(delete_sql)\n\n#print the current time\ndef print_time():\n current_time = datetime.datetime.now()\n formatted_time = current_time.strftime(\"%Y-%m-%d %H:%M:%S\")\n print(formatted_time)\n\n\ndef 
lock_contention(threads,duration,ncolumns,nrows,colsize,table_name='table1'):\n #create a new table\n print_time()\n delete_table(table_name)\n create_table(table_name,colsize, ncolumns)\n db=Database(init())\n # insert some data to be updated \n insert_definitions = ', '.join(f'(SELECT substr(md5(random()::text), 1, {colsize}))' for i in range(ncolumns))\n insert_data=f'insert into {table_name} select generate_series(1,{nrows}),{insert_definitions}, now();' \n db.execute_sqls(insert_data) \n pool = Pool(threads)\n for _ in range(threads):\n pool.apply_async(\n lock, (table_name, ncolumns, colsize, duration, nrows))\n pool.close()\n pool.join()\n #delete the table\n delete_table(table_name)\n print_time()\n\ndef lock(table_name, ncolumns, colsize, duration, nrows):\n args=init()\n start = time.time()\n #lock_contention\n while time.time()-start < duration:\n conn = psycopg2.connect(database=args.dbname, user=args.user, password=args.password,\n host=args.host, port=args.port)\n cur = conn.cursor()\n while time.time()-start < duration:\n col_name = random.randint(0, ncolumns-1)\n row_name = random.randint(1, nrows-1)\n lock_contention = f'update {table_name} set name{col_name}=(SELECT substr(md5(random()::text), 1, {colsize})) where id ={row_name}'\n #db.concurrent_execute_sql(threads,duration,lock_contention,nrows)\n cur.execute(lock_contention)\n conn.commit()\n conn.commit()\n conn.close()\n\nif __name__ == \"__main__\":\n # Number of threads to use for concurrent inserts\n num_threads = 161\n \n # Duration for which to run the inserts (in seconds)\n insert_duration = None\n \n # Number of columns in the table\n num_columns = 90\n \n # Number of rows to insert\n num_rows = 364\n \n # Size of each column (in characters)\n column_size = 68\n \n # Table name\n table_name = 'table1'\n \n # Call the insert_large_data function\n lock_contention(num_threads, insert_duration, num_columns, num_rows, column_size, table_name)\n", + "description": "In a database for an online store, 161 users simultaneously attempt to perform frequent update operations in a database table containing 90 columns and 364 rows of product records. Each product record has a column size of 68 characters. These users compete with each other to lock the database table for updates, causing contention and potentially triggering a database exception.\n", + "desc": "In a banking scenario, the database handles customer and transaction data for a bank. Inside the database, there is a table called 'AccountTransactions' that records detailed information about various banking transactions. This table contains 364 rows of data, each representing a transaction record for an account, with a total of 90 columns, each containing information of up to 68 characters. These columns may include transaction ID, account number, transaction type (such as deposit, withdrawal, transfer), transaction amount, transaction date and time, counterparty account information, transaction status, employee ID (of the bank staff who processed the transaction), transaction location, currency type, and more.As part of the bank's operations, there might be occasions where multiple users simultaneously attempt frequent update operations on the 'AccountTransactions' table using the provided script. These operations could include updating transaction statuses, modifying transaction amounts, or adding transaction notes. In this case, the script is executed with the parameters '--anomaly LOCK_CONTENTION --threads 161 --ncolumn 90 --colsize 68 --nrow 364'. 
This means that 161 users simultaneously attempt to perform update operations on the 'AccountTransactions' table, with the specific table schema specified (90 columns, each containing information of up to 68 characters) and a total of 364 rows of data. The aim is to simulate a scenario where there might be contention for locking the database table, which could potentially lead to performance issues.\n" + }, + "51": { + "start_time": "1697304247", + "end_time": "1697304319", + "start_timestamp": "2023-10-15 01:24:07", + "end_timestamp": "2023-10-15 01:25:19", + "alerts": [], + "labels": [ + "highly deletes" + ], + "command": "python anomaly_trigger/main.py --anomaly VACUUM", + "script": "import psycopg2\nimport sys\nsys.path.append('/root/DB-GPT/')\nimport time\nimport datetime\nimport random\nimport yaml\nfrom multiprocessing.pool import *\n\n\nclass DBArgs(object):\n\n def __init__(self, dbtype, config, dbname=None):\n self.dbtype = dbtype\n if self.dbtype == 'mysql':\n self.host = config['host']\n self.port = config['port']\n self.user = config['user']\n self.password = config['password']\n self.dbname = dbname if dbname else config['dbname']\n self.driver = 'com.mysql.jdbc.Driver'\n self.jdbc = 'jdbc:mysql://'\n else:\n self.host = config['host']\n self.port = config['port']\n self.user = config['user']\n self.password = config['password']\n self.dbname = dbname if dbname else config['dbname']\n self.driver = 'org.postgresql.Driver'\n self.jdbc = 'jdbc:postgresql://'\n\nclass Database():\n def __init__(self, args, timeout=-1):\n self.args = args\n self.conn = self.resetConn(timeout)\n\n\n # self.schema = self.compute_table_schema()\n\n def resetConn(self, timeout=-1):\n conn = psycopg2.connect(database=self.args.dbname,\n user=self.args.user,\n password=self.args.password,\n host=self.args.host,\n port=self.args.port)\n return conn\n \n def execute_sqls(self,sql):\n self.conn =self.resetConn(timeout=-1)\n cur = self.conn.cursor()\n cur.execute(sql)\n self.conn.commit()\n cur.close()\n self.conn.close()\n\n def execute_sql_duration(self, duration, sql, max_id=0, commit_interval=500):\n self.conn = self.resetConn(timeout=-1)\n cursor = self.conn.cursor()\n start = time.time()\n cnt = 0\n if duration > 0:\n while (time.time() - start) < duration:\n if max_id > 0:\n id = random.randint(1, max_id - 1)\n cursor.execute(sql + str(id) + ';')\n else:\n cursor.execute(sql)\n cnt += 1\n if cnt % commit_interval == 0:\n self.conn.commit()\n else:\n print(\"error, the duration should be larger than 0\")\n self.conn.commit()\n cursor.close()\n self.conn.close()\n return cnt\n\n def concurrent_execute_sql(self, threads, duration, sql, max_id=0, commit_interval=500):\n pool = ThreadPool(threads)\n results = [pool.apply_async(self.execute_sql_duration, (duration, sql, max_id, commit_interval)) for _ in range(threads)]\n pool.close()\n pool.join()\n return results\n \ndef init():\n #add the config\n config_path = \"/root/DB-GPT/config/tool_config.yaml\"\n with open(config_path, 'r') as config_file:\n config = yaml.safe_load(config_file) \n db_args =DBArgs('pgsql', config)\n return db_args\n\n\n#create a table\ndef create_table(table_name,colsize, ncolumns):\n db=Database(init())\n column_definitions = ', '.join(f'name{i} varchar({colsize})' for i in range(ncolumns))\n creat_sql = f'CREATE TABLE {table_name} (id int, {column_definitions}, time timestamp);'\n db.execute_sqls(creat_sql)\n\n#delete the table\ndef delete_table(table_name):\n db=Database(init())\n delete_sql=f'DROP TABLE if exists 
{table_name}'\n db.execute_sqls(delete_sql)\n\n#print the current time\ndef print_time():\n current_time = datetime.datetime.now()\n formatted_time = current_time.strftime(\"%Y-%m-%d %H:%M:%S\")\n print(formatted_time)\n\n\ndef vacuum(threads,duration,ncolumns,nrows,colsize,table_name='table1'):\n db=Database(init())\n #create a new table\n print_time()\n delete_table(table_name)\n create_table(table_name,colsize, ncolumns)\n\n # insert some data to be deleted\n insert_definitions = ', '.join(f'(SELECT substr(md5(random()::text), 1, {colsize}))' for i in range(ncolumns))\n insert_data=f'insert into {table_name} select generate_series(1,{nrows}),{insert_definitions}, now();' \n db.execute_sqls(insert_data) \n\n # delete 80% of the rows\n delete_nrows=int(nrows*0.8)\n vacuum=f'delete from {table_name} where id < {delete_nrows};'\n db.execute_sqls(vacuum)\n\n # do the select , then the vacuum occurs\n select='select * from '+table_name+' where id='\n db.concurrent_execute_sql(threads,duration,select,nrows)\n\n #delete the table\n delete_table(table_name)\n print_time()\n\n\nif __name__ == \"__main__\":\n # Number of threads to use for concurrent inserts\n num_threads = 130\n \n # Duration for which to run the inserts (in seconds)\n insert_duration = None\n \n # Number of columns in the table\n num_columns = 16\n \n # Number of rows to insert\n num_rows = 2045737\n \n # Size of each column (in characters)\n column_size = 75\n \n # Table name\n table_name = 'table1'\n \n # Call the insert_large_data function\n vacuum(num_threads, insert_duration, num_columns, num_rows, column_size, table_name)\n", + "description": "In the database of an online store, 130 users simultaneously perform a search after a large-scale data cleaning operation on a database table containing 16 columns, 2,045,737 rows, each column size of 75 characters of commodity records.\n", + "desc": "In the Internet of Things (IoT) scenario, there is a database specifically designed for storing and analyzing sensor data, known as 'SensorDataDB'. This database is used to handle a large volume of data from various sensors. One of the key tables in the database is called 'SensorReadings', which stores detailed information about sensor readings. This table consists of 2,045,737 rows of data, with 16 columns, each containing information of up to 75 characters. These columns may include sensor ID, reading type, reading value, timestamp, location, and status information. In this specific scenario, there is a need to optimize the database performance by performing a VACUUM operation. The VACUUM operation is used to reclaim unused space and improve the efficiency of disk space allocation. 
When executed with 130 threads, this VACUUM operation can help optimize the storage and performance of the 'SensorDataDB' database.\n" + }, + "52": { + "start_time": "1697304379", + "end_time": "1697304493", + "start_timestamp": "2023-10-15 01:26:19", + "end_timestamp": "2023-10-15 01:28:13", + "alerts": [], + "labels": [ + "too many indexes" + ], + "command": "python anomaly_trigger/main.py --anomaly REDUNDANT_INDEX", + "script": "import psycopg2\nimport sys\nsys.path.append('/root/DB-GPT/')\nimport time\nimport datetime\nimport random\nimport yaml\nfrom multiprocessing.pool import *\n\n\nclass DBArgs(object):\n\n def __init__(self, dbtype, config, dbname=None):\n self.dbtype = dbtype\n if self.dbtype == 'mysql':\n self.host = config['host']\n self.port = config['port']\n self.user = config['user']\n self.password = config['password']\n self.dbname = dbname if dbname else config['dbname']\n self.driver = 'com.mysql.jdbc.Driver'\n self.jdbc = 'jdbc:mysql://'\n else:\n self.host = config['host']\n self.port = config['port']\n self.user = config['user']\n self.password = config['password']\n self.dbname = dbname if dbname else config['dbname']\n self.driver = 'org.postgresql.Driver'\n self.jdbc = 'jdbc:postgresql://'\n\nclass Database():\n def __init__(self, args, timeout=-1):\n self.args = args\n self.conn = self.resetConn(timeout)\n\n\n # self.schema = self.compute_table_schema()\n\n def resetConn(self, timeout=-1):\n conn = psycopg2.connect(database=self.args.dbname,\n user=self.args.user,\n password=self.args.password,\n host=self.args.host,\n port=self.args.port)\n return conn\n \n def execute_sqls(self,sql):\n self.conn =self.resetConn(timeout=-1)\n cur = self.conn.cursor()\n cur.execute(sql)\n self.conn.commit()\n cur.close()\n self.conn.close()\n\n def execute_sql_duration(self, duration, sql, max_id=0, commit_interval=500):\n self.conn = self.resetConn(timeout=-1)\n cursor = self.conn.cursor()\n start = time.time()\n cnt = 0\n if duration > 0:\n while (time.time() - start) < duration:\n if max_id > 0:\n id = random.randint(1, max_id - 1)\n cursor.execute(sql + str(id) + ';')\n else:\n cursor.execute(sql)\n cnt += 1\n if cnt % commit_interval == 0:\n self.conn.commit()\n else:\n print(\"error, the duration should be larger than 0\")\n self.conn.commit()\n cursor.close()\n self.conn.close()\n return cnt\n\n def concurrent_execute_sql(self, threads, duration, sql, max_id=0, commit_interval=500):\n pool = ThreadPool(threads)\n results = [pool.apply_async(self.execute_sql_duration, (duration, sql, max_id, commit_interval)) for _ in range(threads)]\n pool.close()\n pool.join()\n return results\n \n def build_index(self, table_name, idx_num):\n self.conn = self.resetConn(timeout=-1)\n cursor = self.conn.cursor()\n \n for i in range(0, idx_num):\n the_sql = 'CREATE INDEX index_' + table_name + '_' + str(i) + ' ON ' + table_name + '(name' + str(i) + ');'\n print(the_sql)\n cursor.execute(the_sql)\n\n \n self.conn.commit()\n self.conn.close()\n return\n\n\n \n def drop_index(self,table_name):\n self.conn = self.resetConn(timeout=-1)\n cursor = self.conn.cursor()\n cursor.execute(\"select indexname from pg_indexes where tablename='\"+table_name+\"';\")\n idxs = cursor.fetchall()\n for idx in idxs:\n the_sql = 'DROP INDEX ' + idx[0] + ';'\n cursor.execute(the_sql)\n print(the_sql)\n self.conn.commit()\n self.conn.close()\n return\n\n\ndef init():\n #add the config\n config_path = \"/root/DB-GPT/config/tool_config.yaml\"\n with open(config_path, 'r') as config_file:\n config = 
yaml.safe_load(config_file) \n db_args =DBArgs('pgsql', config)\n return db_args\n\n\n#create a table\ndef create_table(table_name,colsize, ncolumns):\n db=Database(init())\n column_definitions = ', '.join(f'name{i} varchar({colsize})' for i in range(ncolumns))\n creat_sql = f'CREATE TABLE {table_name} (id int, {column_definitions}, time timestamp);'\n db.execute_sqls(creat_sql)\n\n#delete the table\ndef delete_table(table_name):\n db=Database(init())\n delete_sql=f'DROP TABLE if exists {table_name}'\n db.execute_sqls(delete_sql)\n\n#print the current time\ndef print_time():\n current_time = datetime.datetime.now()\n formatted_time = current_time.strftime(\"%Y-%m-%d %H:%M:%S\")\n print(formatted_time)\n\ndef redundent_index(threads,duration,ncolumns,nrows,colsize,nindex,table_name='table1'):\n #create a new table\n print_time()\n delete_table(table_name)\n create_table(table_name,colsize, ncolumns)\n db=Database(init())\n # insert some data to be updated \n insert_definitions = ', '.join(f'(SELECT substr(md5(random()::text), 1, {colsize}))' for i in range(ncolumns))\n insert_data=f'insert into {table_name} select generate_series(1,{nrows}),{insert_definitions}, now();' \n db.execute_sqls(insert_data) \n\n #initialization of the indexes\n nindex=int((nindex*ncolumns)/10)\n db.build_index(table_name,nindex)\n id_index='CREATE INDEX index_'+table_name+'_id ON '+table_name+'(id);'\n db.execute_sqls(id_index)\n\n #lock_contention\n pool = Pool(threads)\n for _ in range(threads):\n pool.apply_async(\n lock, (table_name, ncolumns, colsize, duration, nrows))\n pool.close()\n pool.join()\n\n #drop the index\n db.drop_index(table_name)\n\n #delete the table\n delete_table(table_name)\n print_time()\n\n\ndef lock(table_name, ncolumns, colsize, duration, nrows):\n args=init()\n start = time.time()\n #lock_contention\n while time.time()-start < duration:\n conn = psycopg2.connect(database=args.dbname, user=args.user, password=args.password,\n host=args.host, port=args.port)\n cur = conn.cursor()\n while time.time()-start < duration:\n col_name = random.randint(0, ncolumns-1)\n row_name = random.randint(1, nrows-1)\n lock_contention = f'update {table_name} set name{col_name}=(SELECT substr(md5(random()::text), 1, {colsize})) where id ={row_name}'\n #db.concurrent_execute_sql(threads,duration,lock_contention,nrows)\n cur.execute(lock_contention)\n conn.commit()\n conn.commit()\n conn.close()\n\nif __name__ == \"__main__\":\n # Number of threads to use for concurrent inserts\n num_threads = 5\n \n # Duration for which to run the inserts (in seconds)\n insert_duration = None\n \n # Number of columns in the table\n num_columns = 83\n \n # Number of rows to insert\n num_rows = 935270\n \n # Size of each column (in characters)\n column_size = 88\n \n # Table name\n table_name = 'table1'\n \n nindex=6\n \n # Call the insert_large_data function\n redundent_index(num_threads, insert_duration, num_columns, num_rows, column_size, nindex,table_name)\n", + "description": "In a database of an e-commerce platform, there is a scenario where multiple users are performing a query operation using redundant indexes. The specific scenario involves 5 users searching in a database table containing 83 columns and 935,270 rows. Each column has a size of 88 characters. 
The purpose of this scenario is to simulate the additional storage footprint and performance overhead that arise due to the presence of redundant indexes.\n", + "desc": "In the business intelligence scenario, a database named 'BusinessIntelDB' is used to store and process data for analyzing and generating various business reports. One key table in this database is called 'SalesData', which contains detailed information about sales transactions, such as sales ID, product ID, customer ID, sales date, sales amount, sales channel, and other relevant fields. This table contains 935,270 rows of data, with 83 columns, each capable of storing up to 88 characters. In order to improve the performance of complex queries, such as monthly sales reports, sales by region, or sales by product category, the database administrator might create redundant indexes on the 'SalesData' table. These indexes could be based on sales date, product category, sales channel, or other relevant fields. However, the creation of numerous redundant indexes can lead to additional storage usage and overhead in the database. It can also cause database fragmentation and impact the overall performance of the database. The frequent data retrieval and index operations might result in delayed report generation and hinder the efficiency of the business decision-making process.\n" + }, + "53": { + "start_time": "1697304553", + "end_time": "1697304644", + "start_timestamp": "2023-10-15 01:29:13", + "end_timestamp": "2023-10-15 01:30:44", + "alerts": [], + "labels": [ + "INSERT_LARGE_DATA", + "IO_CONTENTION" + ], + "command": "python anomaly_trigger/main.py --anomaly INSERT_LARGE_DATA,IO_CONTENTION", + "script": "import os\nimport datetime\n\n#print the current time\ndef print_time():\n current_time = datetime.datetime.now()\n formatted_time = current_time.strftime(\"%Y-%m-%d %H:%M:%S\")\n print(formatted_time)\n\nif __name__ == \"__main__\":\n print_time()\n command = (\n \"su - root -c 'cd /sysbench-tpcc-master; \"\n \"./tpcc.lua --db-driver=pgsql --tables=2 --scale=3 --threads=50 --events=0 \"\n \"--pgsql-host=xxxx --pgsql-user=xxxx --pgsql-password=xxxx \"\n \"--pgsql-port=5432 --pgsql-db=tpcc --time=90 --rand-type=uniform --report-interval=10 run'\"\n )\n\n os.system(command)\n print_time()\n", + "description": "In a file sharing system where multiple users upload, download, or edit files at the same time, the system experiences contention in input/output operations due to the large amount of data being processed simultaneously. This leads to slower file transfer speeds.\n", + "desc": "In an IoT (Internet of Things) scenario, there is a database used for collecting and analyzing sensor data. This database records data from various types of sensors, such as temperature, humidity, pressure, light, and motion sensors. When a large number of sensors start transmitting data simultaneously at a high frequency, the database might experience performance issues due to the increased workload. This can result in slower data processing and potentially lead to I/O contention, where the storage and network bandwidth are strained. As a result, file transfer speeds might be slower, especially when dealing with large files. 
Additionally, frequent write operations to the database, such as storing sensor readings and updating metadata, can further impact the overall performance of the system.\n" + }, + "54": { + "start_time": "1697304704", + "end_time": "1697304764", + "start_timestamp": "2023-10-15 01:31:44", + "end_timestamp": "2023-10-15 01:32:44", + "alerts": [], + "labels": [ + "POOR JOIN PERFORMANCE", + "CPU CONTENTION" + ], + "command": "python anomaly_trigger/main.py --anomaly POOR_JOIN_PERFORMANCE,CPU_CONTENTION", + "script": "import os\nimport re\nimport time\n\nimport psycopg2\n\n\nREPEATCOUNT = 1\nTIMELOGPATH = str(int(time.time())) + \"_job_trigger_time_log.txt\"\nTIMELOG = open(TIMELOGPATH, 'w+')\n\n\nclass Database():\n\n def __init__(self):\n self.conn = None\n self.conn = psycopg2.connect(database='imdbload',\n user='xxxx',\n password='xxxx',\n host='xxxx',\n port=5432)\n\n def execute_sql(self, sql):\n fail = 1\n cur = self.conn.cursor()\n i = 0\n cnt = 3\n while fail == 1 and i < cnt:\n try:\n fail = 0\n cur.execute(sql)\n except BaseException as error:\n fail = 1\n print(error)\n res = []\n if fail == 0:\n res = cur.fetchall()\n i = i + 1\n if fail == 1:\n # print(\"SQL Execution Fatal!!\", sql)\n return 0, ''\n elif fail == 0:\n return 1, res\n\n\ndef all_sql_files():\n res_path = \"{}/join-order-benchmark-master/\".format(\n os.path.dirname(os.path.abspath(__file__)))\n # all_file_list = list(filter(file_filter, os.listdir(res_path)))\n # all_file_list = sorted(all_file_list, key=custom_sort)\n all_file_list = [\n '1a.sql', '1b.sql', '1c.sql', '1d.sql',\n '2a.sql', '2b.sql', '2c.sql', '2d.sql',\n '3a.sql', '3b.sql', '3c.sql',\n '4a.sql', '4b.sql', '4c.sql',\n '5a.sql', '5b.sql', '5c.sql',\n '6a.sql', '6b.sql', '6c.sql', '6d.sql', '6e.sql', '6f.sql',\n '7a.sql', '7b.sql', '7c.sql',\n '8a.sql', '8b.sql', '8c.sql', '8d.sql',\n '9a.sql', '9b.sql', '9c.sql', '9d.sql',\n '10a.sql', '10b.sql', '10c.sql',\n '11a.sql', '11b.sql', '11c.sql', '11d.sql',\n '12a.sql', '12b.sql', '12c.sql',\n '13a.sql', '13b.sql', '13c.sql', '13d.sql',\n '14a.sql', '14b.sql', '14c.sql',\n '15a.sql', '15b.sql', '15c.sql', '15d.sql',\n '16a.sql', '16b.sql', '16c.sql', '16d.sql',\n '17a.sql', '17b.sql', '17c.sql', '17d.sql', '17e.sql', '17f.sql',\n '18a.sql', '18b.sql', '18c.sql',\n '19a.sql', '19b.sql', '19c.sql', '19d.sql',\n '20a.sql', '20b.sql', '20c.sql',\n '21a.sql', '21b.sql', '21c.sql',\n '22a.sql', '22b.sql', '22c.sql', '22d.sql',\n '23a.sql', '23b.sql', '23c.sql',\n '24a.sql', '24b.sql',\n '25a.sql', '25b.sql', '25c.sql',\n '26a.sql', '26b.sql', '26c.sql',\n '27a.sql', '27b.sql', '27c.sql',\n '28a.sql', '28b.sql', '28c.sql',\n '29a.sql', '29b.sql', '29c.sql',\n '30a.sql', '30b.sql', '30c.sql',\n '31a.sql', '31b.sql', '31c.sql',\n '32a.sql', '32b.sql',\n '33a.sql', '33b.sql', '33c.sql']\n\n print(all_file_list)\n files_list = []\n for file in all_file_list:\n files_list.append(res_path + file)\n return files_list\n\n\ndef custom_sort(item):\n # \u63d0\u53d6\u6570\u5b57\u548c\u5b57\u6bcd\u90e8\u5206\n match = re.match(r'(\\d+)(\\D+)', item)\n # \u5c06\u6570\u5b57\u90e8\u5206\u8f6c\u6362\u4e3a\u6574\u6570\u4ee5\u8fdb\u884c\u6bd4\u8f83\n num_part = int(match.group(1))\n # \u8fd4\u56de\u5143\u7ec4\u4ee5\u6309\u6570\u5b57\u548c\u5b57\u6bcd\u6392\u5e8f\n return (num_part, match.group(2))\n\n\ndef file_filter(f):\n if f[-4:] == '.sql' and 'schema' not in f and 'fkindexes' not in f:\n return True\n else:\n return False\n\n\ndef get_sql_from_file(file_name):\n file = open(file_name)\n lines = file.readlines().copy()\n sql 
= ''\n for line in lines:\n sql += line\n sql = sql.replace('\n', ' ').replace(' ', ' ').replace(' ', ' ')\n file.close()\n return sql\n\n\ndef test_hint_from_file(sql_file):\n db = Database()\n sql = get_sql_from_file(sql_file)\n success, result_cont = db.execute_sql(sql)\n print(success, result_cont)\n\n\ndef test_all():\n sql_files = all_sql_files()\n\n for sql_file in list(sql_files)[:-10]:\n if sql_file:\n test_hint_from_file(sql_file)\n\n\ndef test_one():\n res_path = \"{}/join-order-benchmark-master/\".format(\n os.path.dirname(os.path.abspath(__file__)))\n test_hint_from_file(res_path + '1a.sql')\n\n\nif __name__ == '__main__':\n for i in range(0, REPEATCOUNT):\n TIMELOG.write(str(int(time.time()))+\";\")\n test_all()\n TIMELOG.write(str(int(time.time()))+\"\n\")\n TIMELOG.flush()\n\n TIMELOG.close()\n\n)\n", + "description": "In a database system, when performing join operations between multiple tables with poor join performance due to a large number of records and limited resources, high CPU contention can occur. This can lead to slower query execution and resource competition among multiple users or processes trying to access the CPU simultaneously.\n", + "desc": "In the Internet of Things scenario, consider an e-commerce platform's database named 'ECommerceDB', which includes a crucial table named 'OrderDetails' for recording detailed order information. This table contains data for millions of orders, each with multiple columns corresponding to order ID, customer ID, product ID, order date, quantity, price, payment method, shipping address, and more. In this database, there might be a situation where joining the 'OrderDetails' table with another related table, such as the 'ProductDetails' table, leads to poor performance. Joining these tables could be necessary to retrieve additional information about the products ordered, such as product name, brand, and category. However, due to inefficient indexing, lack of appropriate join strategies, or large data volumes, the execution of join operations might become slow and resource-intensive. As a result, the database might experience poor join performance, leading to delays in retrieving complete order information. 
Additionally, this performance issue could consume excessive CPU resources, further impacting the overall system performance and potentially leading to CPU contention.\n" + }, + "55": { + "start_time": "1697304824", + "end_time": "1697304974", + "start_timestamp": "2023-10-15 01:33:44", + "end_timestamp": "2023-10-15 01:36:14", + "alerts": [], + "labels": [ + "FETCH_LARGE_DATA", + "CORRELATED SUBQUERY" + ], + "command": "python anomaly_trigger/main.py --anomaly FETCH_LARGE_DATA,CORRELATED_SUBQUERY", + "script": "import os\nimport re\nimport time\n\nimport psycopg2\n\n\nREPEATCOUNT = 1\nTIMELOGPATH = str(int(time.time())) + \"_tpch_trigger_time_log.txt\"\nTIMELOG = open(TIMELOGPATH, 'w+')\n\n\nclass Database():\n\n def __init__(self):\n self.conn = None\n self.conn = psycopg2.connect(database='tpch',\n user='xxxx',\n password='xxxx',\n host='xxxx',\n port=5432)\n\n def execute_sql(self, sql):\n fail = 1\n cur = self.conn.cursor()\n i = 0\n cnt = 3\n while fail == 1 and i < cnt:\n try:\n fail = 0\n cur.execute(sql)\n except BaseException as error:\n fail = 1\n print(error)\n res = []\n if fail == 0:\n res = cur.fetchall()\n i = i + 1\n if fail == 1:\n # print(\"SQL Execution Fatal!!\", sql)\n return 0, ''\n elif fail == 0:\n return 1, res\n\n\ndef all_sql_files():\n res_path = \"{}/tpch-queries/\".format(\n os.path.dirname(os.path.abspath(__file__)))\n # all_file_list = list(filter(file_filter, os.listdir(res_path)))\n # all_file_list = sorted(all_file_list, key=custom_sort)\n all_file_list = [\n '4.explain.sql']\n\n print(all_file_list)\n files_list = []\n for file in all_file_list:\n files_list.append(res_path + file)\n return files_list\n\n\ndef custom_sort(item):\n # \u63d0\u53d6\u6570\u5b57\u548c\u5b57\u6bcd\u90e8\u5206\n match = re.match(r'(\\d+)(\\D+)', item)\n # \u5c06\u6570\u5b57\u90e8\u5206\u8f6c\u6362\u4e3a\u6574\u6570\u4ee5\u8fdb\u884c\u6bd4\u8f83\n num_part = int(match.group(1))\n # \u8fd4\u56de\u5143\u7ec4\u4ee5\u6309\u6570\u5b57\u548c\u5b57\u6bcd\u6392\u5e8f\n return (num_part, match.group(2))\n\n\ndef file_filter(f):\n if f[-4:] == '.sql' and 'schema' not in f and 'fkindexes' not in f:\n return True\n else:\n return False\n\n\ndef get_sql_from_file(file_name):\n file = open(file_name)\n lines = file.readlines().copy()\n sql = ''\n for line in lines:\n sql += line\n sql = sql.replace('\n', ' ').replace(' ', ' ').replace(' ', ' ')\n file.close()\n return sql\n\n\ndef test_hint_from_file(sql_file):\n db = Database()\n sql = get_sql_from_file(sql_file)\n success, result_cont = db.execute_sql(sql)\n print(success, result_cont)\n\n\ndef test_all():\n sql_files = all_sql_files()\n\n for sql_file in list(sql_files):\n if sql_file:\n test_hint_from_file(sql_file)\n\n\ndef test_one():\n res_path = \"{}/tpch-queries/\".format(\n os.path.dirname(os.path.abspath(__file__)))\n test_hint_from_file(res_path + '1.explain.sql')\n\n\nif __name__ == '__main__':\n for i in range(0, REPEATCOUNT):\n TIMELOG.write(str(int(time.time()))+\";\")\n test_all()\n TIMELOG.write(str(int(time.time()))+\"\n\")\n TIMELOG.flush()\n\n TIMELOG.close()\n", + "description": "In a database used by an e-commerce platform, there is a scenario where a large amount of data needs to be fetched from the database, specifically related to the inventory of each product. This requires executing correlated subqueries. 
However, if the subqueries are not optimized, the performance of the inventory querying process may be negatively impacted.\n", + "desc": "In the Internet of Things (IoT) scenario, there is a database named 'SensorDataDB' that is used to collect and analyze data from various sensors. This database contains a table called 'SensorReadings' which stores information about sensor readings such as temperature, humidity, pressure, and motion. This table has a large number of rows, potentially in the millions, each representing a sensor reading. When performing queries that require fetching a large amount of sensor data, such as retrieving all temperature readings in a specific time range or location, the database might encounter performance issues. This is because these queries may involve correlated subqueries, where data from multiple sensors need to be compared or analyzed together. These correlated subqueries can be computationally expensive and result in slow query execution times. Additionally, retrieving a large amount of sensor data from the database can also lead to high I/O usage and potential bottlenecks in the system.\n" + }, + "56": { + "start_time": "1697305034", + "end_time": "1697305107", + "start_timestamp": "2023-10-15 01:37:14", + "end_timestamp": "2023-10-15 01:38:27", + "alerts": [], + "labels": [ + "highly concurrent commits or highly concurrent inserts" + ], + "command": "python anomaly_trigger/main.py --anomaly INSERT_LARGE_DATA", + "script": "import psycopg2\nimport sys\nsys.path.append('/root/DB-GPT/')\nimport time\nimport datetime\nimport random\nimport yaml\nfrom multiprocessing.pool import *\n\nclass DBArgs(object):\n\n def __init__(self, dbtype, config, dbname=None):\n self.dbtype = dbtype\n if self.dbtype == 'mysql':\n self.host = config['host']\n self.port = config['port']\n self.user = config['user']\n self.password = config['password']\n self.dbname = dbname if dbname else config['dbname']\n self.driver = 'com.mysql.jdbc.Driver'\n self.jdbc = 'jdbc:mysql://'\n else:\n self.host = config['host']\n self.port = config['port']\n self.user = config['user']\n self.password = config['password']\n self.dbname = dbname if dbname else config['dbname']\n self.driver = 'org.postgresql.Driver'\n self.jdbc = 'jdbc:postgresql://'\n\nclass Database():\n def __init__(self, args, timeout=-1):\n self.args = args\n self.conn = self.resetConn(timeout)\n\n\n # self.schema = self.compute_table_schema()\n\n def resetConn(self, timeout=-1):\n conn = psycopg2.connect(database=self.args.dbname,\n user=self.args.user,\n password=self.args.password,\n host=self.args.host,\n port=self.args.port)\n return conn\n\n def execute_sqls(self,sql):\n self.conn =self.resetConn(timeout=-1)\n cur = self.conn.cursor()\n cur.execute(sql)\n self.conn.commit()\n cur.close()\n self.conn.close()\n\n def execute_sql_duration(self, duration, sql, max_id=0, commit_interval=500):\n self.conn = self.resetConn(timeout=-1)\n cursor = self.conn.cursor()\n start = time.time()\n cnt = 0\n if duration > 0:\n while (time.time() - start) < duration:\n if max_id > 0:\n id = random.randint(1, max_id - 1)\n cursor.execute(sql + str(id) + ';')\n else:\n cursor.execute(sql)\n cnt += 1\n if cnt % commit_interval == 0:\n self.conn.commit()\n else:\n print(\"error, the duration should be larger than 0\")\n self.conn.commit()\n cursor.close()\n self.conn.close()\n return cnt\n\n def concurrent_execute_sql(self, threads, duration, sql, max_id=0, commit_interval=500):\n pool = ThreadPool(threads)\n results = 
[pool.apply_async(self.execute_sql_duration, (duration, sql, max_id, commit_interval)) for _ in range(threads)]\n pool.close()\n pool.join()\n return results\n \ndef init():\n #add the config\n config_path = \"/root/DB-GPT/config/tool_config.yaml\"\n with open(config_path, 'r') as config_file:\n config = yaml.safe_load(config_file) \n db_args =DBArgs('pgsql', config)\n return db_args\n\n#create a table\ndef create_table(table_name,colsize, ncolumns):\n db=Database(init())\n column_definitions = ', '.join(f'name{i} varchar({colsize})' for i in range(ncolumns))\n creat_sql = f'CREATE TABLE {table_name} (id int, {column_definitions}, time timestamp);'\n db.execute_sqls(creat_sql)\n\n#delete the table\ndef delete_table(table_name):\n db=Database(init())\n delete_sql=f'DROP TABLE if exists {table_name}'\n db.execute_sqls(delete_sql)\n\n#print the current time\ndef print_time():\n current_time = datetime.datetime.now()\n formatted_time = current_time.strftime(\"%Y-%m-%d %H:%M:%S\")\n print(formatted_time)\n\n\ndef insert_large_data(threads,duration,ncolumns,nrows,colsize,table_name='table1'):\n\n print_time()\n #Delete undeleted tables\n delete_table(table_name)\n #create a new table\n create_table(table_name,colsize, ncolumns)\n db=Database(init())\n #insert the data\n #insert_definitions = ', '.join(f'repeat(round(random()*999)::text,{(colsize//3)})' for i in range(ncolumns))\n insert_definitions = ', '.join(f'(SELECT substr(md5(random()::text), 1, {colsize}))' for i in range(ncolumns))\n insert_data=f'insert into {table_name} select generate_series(1,{nrows}),{insert_definitions}, now();'\n db.concurrent_execute_sql(threads,duration,insert_data,commit_interval=1)\n\n #delete the table\n delete_table(table_name)\n \n #print the end time\n print_time()\nif __name__ == \"__main__\":\n # Number of threads to use for concurrent inserts\n num_threads = 198\n \n # Duration for which to run the inserts (in seconds)\n insert_duration = None\n \n # Number of columns in the table\n num_columns = 8\n \n # Number of rows to insert\n num_rows = 65\n \n # Size of each column (in characters)\n column_size = 38\n \n # Table name\n table_name = 'table1'\n \n # Call the insert_large_data function\n insert_large_data(num_threads, insert_duration, num_columns, num_rows, column_size, table_name)\n", + "description": "In an industrial manufacturing process, when 198 machines are simultaneously generating a large amount of data and trying to insert it into the database, the process faces challenges due to the high number of threads and the specific characteristics of the data. This may result in exceptions or slow performance during the data insertion process. The data being inserted consists of 65 rows with 8 columns, each column having a size of 38 characters.\n", + "desc": "In an internet of things (IoT) scenario, there is a database used for collecting and analyzing sensor data, named 'SensorDataDB'. This database is designed to handle a large volume of data from various types of sensors. The primary table in the database is called 'SensorReadings', which contains various fields to store data from sensors. These fields may include sensor ID, reading type, reading value, timestamp, sensor location, and status information. when 198 sensors start transmitting data simultaneously at a high frequency, the database might encounter performance issues. 
Due to the lack of effective data partitioning in the 'SensorReadings' table, insufficient buffering mechanisms, or improper indexing, the database's ability to process these numerous concurrent write requests is limited. This can lead to increased write latency in the database, and in some cases, may even result in database locking, ultimately leading to anomalies.\n" + }, + "57": { + "start_time": "1697305167", + "end_time": "1697305239", + "start_timestamp": "2023-10-15 01:39:27", + "end_timestamp": "2023-10-15 01:40:39", + "alerts": [], + "labels": [ + "highly concurrent commits or highly concurrent inserts" + ], + "command": "python anomaly_trigger/main.py --anomaly INSERT_LARGE_DATA", + "script": "import psycopg2\nimport sys\nsys.path.append('/root/DB-GPT/')\nimport time\nimport datetime\nimport random\nimport yaml\nfrom multiprocessing.pool import *\n\nclass DBArgs(object):\n\n def __init__(self, dbtype, config, dbname=None):\n self.dbtype = dbtype\n if self.dbtype == 'mysql':\n self.host = config['host']\n self.port = config['port']\n self.user = config['user']\n self.password = config['password']\n self.dbname = dbname if dbname else config['dbname']\n self.driver = 'com.mysql.jdbc.Driver'\n self.jdbc = 'jdbc:mysql://'\n else:\n self.host = config['host']\n self.port = config['port']\n self.user = config['user']\n self.password = config['password']\n self.dbname = dbname if dbname else config['dbname']\n self.driver = 'org.postgresql.Driver'\n self.jdbc = 'jdbc:postgresql://'\n\nclass Database():\n def __init__(self, args, timeout=-1):\n self.args = args\n self.conn = self.resetConn(timeout)\n\n\n # self.schema = self.compute_table_schema()\n\n def resetConn(self, timeout=-1):\n conn = psycopg2.connect(database=self.args.dbname,\n user=self.args.user,\n password=self.args.password,\n host=self.args.host,\n port=self.args.port)\n return conn\n\n def execute_sqls(self,sql):\n self.conn =self.resetConn(timeout=-1)\n cur = self.conn.cursor()\n cur.execute(sql)\n self.conn.commit()\n cur.close()\n self.conn.close()\n\n def execute_sql_duration(self, duration, sql, max_id=0, commit_interval=500):\n self.conn = self.resetConn(timeout=-1)\n cursor = self.conn.cursor()\n start = time.time()\n cnt = 0\n if duration > 0:\n while (time.time() - start) < duration:\n if max_id > 0:\n id = random.randint(1, max_id - 1)\n cursor.execute(sql + str(id) + ';')\n else:\n cursor.execute(sql)\n cnt += 1\n if cnt % commit_interval == 0:\n self.conn.commit()\n else:\n print(\"error, the duration should be larger than 0\")\n self.conn.commit()\n cursor.close()\n self.conn.close()\n return cnt\n\n def concurrent_execute_sql(self, threads, duration, sql, max_id=0, commit_interval=500):\n pool = ThreadPool(threads)\n results = [pool.apply_async(self.execute_sql_duration, (duration, sql, max_id, commit_interval)) for _ in range(threads)]\n pool.close()\n pool.join()\n return results\n \ndef init():\n #add the config\n config_path = \"/root/DB-GPT/config/tool_config.yaml\"\n with open(config_path, 'r') as config_file:\n config = yaml.safe_load(config_file) \n db_args =DBArgs('pgsql', config)\n return db_args\n\n#create a table\ndef create_table(table_name,colsize, ncolumns):\n db=Database(init())\n column_definitions = ', '.join(f'name{i} varchar({colsize})' for i in range(ncolumns))\n creat_sql = f'CREATE TABLE {table_name} (id int, {column_definitions}, time timestamp);'\n db.execute_sqls(creat_sql)\n\n#delete the table\ndef delete_table(table_name):\n db=Database(init())\n delete_sql=f'DROP TABLE if exists 
{table_name}'\n db.execute_sqls(delete_sql)\n\n#print the current time\ndef print_time():\n current_time = datetime.datetime.now()\n formatted_time = current_time.strftime(\"%Y-%m-%d %H:%M:%S\")\n print(formatted_time)\n\n\ndef insert_large_data(threads,duration,ncolumns,nrows,colsize,table_name='table1'):\n\n print_time()\n #Delete undeleted tables\n delete_table(table_name)\n #create a new table\n create_table(table_name,colsize, ncolumns)\n db=Database(init())\n #insert the data\n #insert_definitions = ', '.join(f'repeat(round(random()*999)::text,{(colsize//3)})' for i in range(ncolumns))\n insert_definitions = ', '.join(f'(SELECT substr(md5(random()::text), 1, {colsize}))' for i in range(ncolumns))\n insert_data=f'insert into {table_name} select generate_series(1,{nrows}),{insert_definitions}, now();'\n db.concurrent_execute_sql(threads,duration,insert_data,commit_interval=1)\n\n #delete the table\n delete_table(table_name)\n \n #print the end time\n print_time()\nif __name__ == \"__main__\":\n # Number of threads to use for concurrent inserts\n num_threads = 198\n \n # Duration for which to run the inserts (in seconds)\n insert_duration = None\n \n # Number of columns in the table\n num_columns = 40\n \n # Number of rows to insert\n num_rows = 93\n \n # Size of each column (in characters)\n column_size = 73\n \n # Table name\n table_name = 'table1'\n \n # Call the insert_large_data function\n insert_large_data(num_threads, insert_duration, num_columns, num_rows, column_size, table_name)\n", + "description": "In a real-life scenario, this script represents a situation where 198 devices are generating a large amount of data simultaneously. This data needs to be inserted into a database, which has a table with 40 columns and 93 rows. Each column has a size of 73 characters. The purpose of running this script is to simulate the database exception that can occur due to the high volume of data being inserted at the same time.\n", + "desc": "In an Internet of Things (IoT) scenario, there exists a database specifically used for collecting and analyzing sensor data. This database is named 'SensorDataDB' and is designed to handle a large volume of data from various types of sensors. One primary table within this database is called 'SensorReadings', which contains fields to store data from different sensors. These fields may include sensor ID, reading type (such as temperature, humidity, pressure, light, motion), reading value, timestamp, sensor location, and status information. In a specific case, the script triggers an anomaly related to inserting a large amount of data into the 'SensorReadings' table. The script specifies the following parameters: 198 threads, 40 columns in the table with each column having a size of 73 characters, and a total of 93 rows of new data to be inserted. As a result of this large-scale data insertion, the database might encounter performance issues. Factors such as insufficient buffering mechanisms, improper indexing, or the lack of effective data partitioning in the 'SensorReadings' table could limit the database's ability to process these numerous concurrent write requests efficiently. This could lead to increased write latency, database locking, and ultimately result in anomalies. 
These anomalies could adversely affect the database's performance and disrupt the smooth operation of the overall IoT system.\n" + }, + "58": { + "start_time": "1697305299", + "end_time": "1697305360", + "start_timestamp": "2023-10-15 01:41:39", + "end_timestamp": "2023-10-15 01:42:40", + "alerts": [], + "labels": [ + "highly concurrent updates" + ], + "command": "python anomaly_trigger/main.py --anomaly LOCK_CONTENTION", + "script": "import psycopg2\nimport sys\nsys.path.append('/root/DB-GPT/')\nimport time\nimport datetime\nimport random\nimport yaml\nfrom multiprocessing.pool import *\n\n\nclass DBArgs(object):\n\n def __init__(self, dbtype, config, dbname=None):\n self.dbtype = dbtype\n if self.dbtype == 'mysql':\n self.host = config['host']\n self.port = config['port']\n self.user = config['user']\n self.password = config['password']\n self.dbname = dbname if dbname else config['dbname']\n self.driver = 'com.mysql.jdbc.Driver'\n self.jdbc = 'jdbc:mysql://'\n else:\n self.host = config['host']\n self.port = config['port']\n self.user = config['user']\n self.password = config['password']\n self.dbname = dbname if dbname else config['dbname']\n self.driver = 'org.postgresql.Driver'\n self.jdbc = 'jdbc:postgresql://'\n\nclass Database():\n def __init__(self, args, timeout=-1):\n self.args = args\n self.conn = self.resetConn(timeout)\n\n\n # self.schema = self.compute_table_schema()\n\n def resetConn(self, timeout=-1):\n conn = psycopg2.connect(database=self.args.dbname,\n user=self.args.user,\n password=self.args.password,\n host=self.args.host,\n port=self.args.port)\n return conn\n \n def execute_sqls(self,sql):\n self.conn =self.resetConn(timeout=-1)\n cur = self.conn.cursor()\n cur.execute(sql)\n self.conn.commit()\n cur.close()\n self.conn.close()\n\n def execute_sql_duration(self, duration, sql, max_id=0, commit_interval=500):\n self.conn = self.resetConn(timeout=-1)\n cursor = self.conn.cursor()\n start = time.time()\n cnt = 0\n if duration > 0:\n while (time.time() - start) < duration:\n if max_id > 0:\n id = random.randint(1, max_id - 1)\n cursor.execute(sql + str(id) + ';')\n else:\n cursor.execute(sql)\n cnt += 1\n if cnt % commit_interval == 0:\n self.conn.commit()\n else:\n print(\"error, the duration should be larger than 0\")\n self.conn.commit()\n cursor.close()\n self.conn.close()\n return cnt\n\n def concurrent_execute_sql(self, threads, duration, sql, max_id=0, commit_interval=500):\n pool = ThreadPool(threads)\n results = [pool.apply_async(self.execute_sql_duration, (duration, sql, max_id, commit_interval)) for _ in range(threads)]\n pool.close()\n pool.join()\n return results\n\ndef init():\n #add the config\n config_path = \"/root/DB-GPT/config/tool_config.yaml\"\n with open(config_path, 'r') as config_file:\n config = yaml.safe_load(config_file) \n db_args =DBArgs('pgsql', config)\n return db_args\n\n\n#create a table\ndef create_table(table_name,colsize, ncolumns):\n db=Database(init())\n column_definitions = ', '.join(f'name{i} varchar({colsize})' for i in range(ncolumns))\n creat_sql = f'CREATE TABLE {table_name} (id int, {column_definitions}, time timestamp);'\n db.execute_sqls(creat_sql)\n\n#delete the table\ndef delete_table(table_name):\n db=Database(init())\n delete_sql=f'DROP TABLE if exists {table_name}'\n db.execute_sqls(delete_sql)\n\n#print the current time\ndef print_time():\n current_time = datetime.datetime.now()\n formatted_time = current_time.strftime(\"%Y-%m-%d %H:%M:%S\")\n print(formatted_time)\n\n\ndef 
lock_contention(threads,duration,ncolumns,nrows,colsize,table_name='table1'):\n #create a new table\n print_time()\n delete_table(table_name)\n create_table(table_name,colsize, ncolumns)\n db=Database(init())\n # insert some data to be updated \n insert_definitions = ', '.join(f'(SELECT substr(md5(random()::text), 1, {colsize}))' for i in range(ncolumns))\n insert_data=f'insert into {table_name} select generate_series(1,{nrows}),{insert_definitions}, now();' \n db.execute_sqls(insert_data) \n pool = Pool(threads)\n for _ in range(threads):\n pool.apply_async(\n lock, (table_name, ncolumns, colsize, duration, nrows))\n pool.close()\n pool.join()\n #delete the table\n delete_table(table_name)\n print_time()\n\ndef lock(table_name, ncolumns, colsize, duration, nrows):\n args=init()\n start = time.time()\n #lock_contention\n while time.time()-start < duration:\n conn = psycopg2.connect(database=args.dbname, user=args.user, password=args.password,\n host=args.host, port=args.port)\n cur = conn.cursor()\n while time.time()-start < duration:\n col_name = random.randint(0, ncolumns-1)\n row_name = random.randint(1, nrows-1)\n lock_contention = f'update {table_name} set name{col_name}=(SELECT substr(md5(random()::text), 1, {colsize})) where id ={row_name}'\n #db.concurrent_execute_sql(threads,duration,lock_contention,nrows)\n cur.execute(lock_contention)\n conn.commit()\n conn.commit()\n conn.close()\n\nif __name__ == \"__main__\":\n # Number of threads to use for concurrent inserts\n num_threads = 77\n \n # Duration for which to run the inserts (in seconds)\n insert_duration = None\n \n # Number of columns in the table\n num_columns = 98\n \n # Number of rows to insert\n num_rows = 380\n \n # Size of each column (in characters)\n column_size = 87\n \n # Table name\n table_name = 'table1'\n \n # Call the insert_large_data function\n lock_contention(num_threads, insert_duration, num_columns, num_rows, column_size, table_name)\n", + "description": "In the database of an online marketplace, 77 users are trying to perform frequent update operations on a database table containing 98 columns and 380 rows of product records. Each column has a size of 87 characters. Due to contention for locking the database table, these users are competing with each other, resulting in a database exception.\n", + "desc": "In a banking scenario, there is a database named 'BankingDB' that handles customer and transaction data for a bank. Within this database, there is a key table named 'AccountTransactions' that records information about various banking transactions. This table contains 380 rows of data, each representing a transaction record for an account. It has a total of 98 columns, each capable of storing information of up to 87 characters. These columns may include transaction ID, account number, transaction type (such as deposit, withdrawal, or transfer), transaction amount, transaction date and time, counterparty account information, transaction status, employee ID (of the bank staff who processed the transaction), transaction location, currency type, and more.At a certain moment, 77 users simultaneously attempt frequent update operations on the 'AccountTransactions' table. These operations involve updating transaction statuses, modifying transaction amounts, or adding transaction notes. Due to the high concurrency and competition among users for locking the database table, there might be a contention issue. 
This contention can lead to delayed processing or failure of other users' transaction requests, impacting the daily operations of the bank. Additionally, if such incidents occur frequently, they could result in rapid growth of the database transaction log, consume excessive storage space, and possibly cause temporary interruptions in database services.\n" + }, + "59": { + "start_time": "1697305420", + "end_time": "1697305533", + "start_timestamp": "2023-10-15 01:43:40", + "end_timestamp": "2023-10-15 01:45:33", + "alerts": [], + "labels": [ + "highly deletes" + ], + "command": "python anomaly_trigger/main.py --anomaly VACUUM", + "script": "import psycopg2\nimport sys\nsys.path.append('/root/DB-GPT/')\nimport time\nimport datetime\nimport random\nimport yaml\nfrom multiprocessing.pool import *\n\n\nclass DBArgs(object):\n\n def __init__(self, dbtype, config, dbname=None):\n self.dbtype = dbtype\n if self.dbtype == 'mysql':\n self.host = config['host']\n self.port = config['port']\n self.user = config['user']\n self.password = config['password']\n self.dbname = dbname if dbname else config['dbname']\n self.driver = 'com.mysql.jdbc.Driver'\n self.jdbc = 'jdbc:mysql://'\n else:\n self.host = config['host']\n self.port = config['port']\n self.user = config['user']\n self.password = config['password']\n self.dbname = dbname if dbname else config['dbname']\n self.driver = 'org.postgresql.Driver'\n self.jdbc = 'jdbc:postgresql://'\n\nclass Database():\n def __init__(self, args, timeout=-1):\n self.args = args\n self.conn = self.resetConn(timeout)\n\n\n # self.schema = self.compute_table_schema()\n\n def resetConn(self, timeout=-1):\n conn = psycopg2.connect(database=self.args.dbname,\n user=self.args.user,\n password=self.args.password,\n host=self.args.host,\n port=self.args.port)\n return conn\n \n def execute_sqls(self,sql):\n self.conn =self.resetConn(timeout=-1)\n cur = self.conn.cursor()\n cur.execute(sql)\n self.conn.commit()\n cur.close()\n self.conn.close()\n\n def execute_sql_duration(self, duration, sql, max_id=0, commit_interval=500):\n self.conn = self.resetConn(timeout=-1)\n cursor = self.conn.cursor()\n start = time.time()\n cnt = 0\n if duration > 0:\n while (time.time() - start) < duration:\n if max_id > 0:\n id = random.randint(1, max_id - 1)\n cursor.execute(sql + str(id) + ';')\n else:\n cursor.execute(sql)\n cnt += 1\n if cnt % commit_interval == 0:\n self.conn.commit()\n else:\n print(\"error, the duration should be larger than 0\")\n self.conn.commit()\n cursor.close()\n self.conn.close()\n return cnt\n\n def concurrent_execute_sql(self, threads, duration, sql, max_id=0, commit_interval=500):\n pool = ThreadPool(threads)\n results = [pool.apply_async(self.execute_sql_duration, (duration, sql, max_id, commit_interval)) for _ in range(threads)]\n pool.close()\n pool.join()\n return results\n \ndef init():\n #add the config\n config_path = \"/root/DB-GPT/config/tool_config.yaml\"\n with open(config_path, 'r') as config_file:\n config = yaml.safe_load(config_file) \n db_args =DBArgs('pgsql', config)\n return db_args\n\n\n#create a table\ndef create_table(table_name,colsize, ncolumns):\n db=Database(init())\n column_definitions = ', '.join(f'name{i} varchar({colsize})' for i in range(ncolumns))\n creat_sql = f'CREATE TABLE {table_name} (id int, {column_definitions}, time timestamp);'\n db.execute_sqls(creat_sql)\n\n#delete the table\ndef delete_table(table_name):\n db=Database(init())\n delete_sql=f'DROP TABLE if exists {table_name}'\n db.execute_sqls(delete_sql)\n\n#print the 
current time\ndef print_time():\n current_time = datetime.datetime.now()\n formatted_time = current_time.strftime(\"%Y-%m-%d %H:%M:%S\")\n print(formatted_time)\n\n\ndef vacuum(threads,duration,ncolumns,nrows,colsize,table_name='table1'):\n db=Database(init())\n #create a new table\n print_time()\n delete_table(table_name)\n create_table(table_name,colsize, ncolumns)\n\n # insert some data to be deleted\n insert_definitions = ', '.join(f'(SELECT substr(md5(random()::text), 1, {colsize}))' for i in range(ncolumns))\n insert_data=f'insert into {table_name} select generate_series(1,{nrows}),{insert_definitions}, now();' \n db.execute_sqls(insert_data) \n\n # delete 80% of the rows\n delete_nrows=int(nrows*0.8)\n vacuum=f'delete from {table_name} where id < {delete_nrows};'\n db.execute_sqls(vacuum)\n\n # do the select , then the vacuum occurs\n select='select * from '+table_name+' where id='\n db.concurrent_execute_sql(threads,duration,select,nrows)\n\n #delete the table\n delete_table(table_name)\n print_time()\n\n\nif __name__ == \"__main__\":\n # Number of threads to use for concurrent inserts\n num_threads = 60\n \n # Duration for which to run the inserts (in seconds)\n insert_duration = None\n \n # Number of columns in the table\n num_columns = 8\n \n # Number of rows to insert\n num_rows = 2799635\n \n # Size of each column (in characters)\n column_size = 73\n \n # Table name\n table_name = 'table1'\n \n # Call the insert_large_data function\n vacuum(num_threads, insert_duration, num_columns, num_rows, column_size, table_name)\n", + "description": "In a database used by an online store, there are 60 users performing searches on a table with 8 columns and 2,799,635 rows of product records. Each column can hold up to 73 characters. The searches are done after a vacuum operation, which is a large-scale data cleaning process. This simulation is meant to test for any exceptions caused by the search operation.\n", + "desc": "In a business intelligence scenario, there is a database named 'AnalyticsDB' used for analyzing various aspects of a company's operations. The database contains a key table named 'SalesRecords' that records information about sales transactions, including details such as sales ID, product ID, customer ID, quantity sold, sales amount, sales date, and sales location. This table has a total of 2,799,635 rows, each representing a sales transaction, and consists of 8 columns, with each column having a size of 73 characters. Periodically, the company's database administrator needs to optimize the database performance by running a VACUUM operation. This operation involves reclaiming space occupied by deleted or updated rows in the 'SalesRecords' table, improving query performance, and reducing storage needs. The VACUUM operation will be executed with a concurrency of 60 threads to speed up the process.By performing the VACUUM operation, the database administrator ensures that the 'SalesRecords' table remains efficient and well-managed, preventing unnecessary storage consumption and maintaining optimal query performance. 
This helps to ensure that the business intelligence analysis and decision-making processes carried out using this database can be performed smoothly and accurately.\n" + }, + "60": { + "start_time": "1697305593", + "end_time": "1697305710", + "start_timestamp": "2023-10-15 01:46:33", + "end_timestamp": "2023-10-15 01:48:30", + "alerts": [], + "labels": [ + "too many indexes" + ], + "command": "python anomaly_trigger/main.py --anomaly REDUNDANT_INDEX", + "script": "import psycopg2\nimport sys\nsys.path.append('/root/DB-GPT/')\nimport time\nimport datetime\nimport random\nimport yaml\nfrom multiprocessing.pool import *\n\n\nclass DBArgs(object):\n\n def __init__(self, dbtype, config, dbname=None):\n self.dbtype = dbtype\n if self.dbtype == 'mysql':\n self.host = config['host']\n self.port = config['port']\n self.user = config['user']\n self.password = config['password']\n self.dbname = dbname if dbname else config['dbname']\n self.driver = 'com.mysql.jdbc.Driver'\n self.jdbc = 'jdbc:mysql://'\n else:\n self.host = config['host']\n self.port = config['port']\n self.user = config['user']\n self.password = config['password']\n self.dbname = dbname if dbname else config['dbname']\n self.driver = 'org.postgresql.Driver'\n self.jdbc = 'jdbc:postgresql://'\n\nclass Database():\n def __init__(self, args, timeout=-1):\n self.args = args\n self.conn = self.resetConn(timeout)\n\n\n # self.schema = self.compute_table_schema()\n\n def resetConn(self, timeout=-1):\n conn = psycopg2.connect(database=self.args.dbname,\n user=self.args.user,\n password=self.args.password,\n host=self.args.host,\n port=self.args.port)\n return conn\n \n def execute_sqls(self,sql):\n self.conn =self.resetConn(timeout=-1)\n cur = self.conn.cursor()\n cur.execute(sql)\n self.conn.commit()\n cur.close()\n self.conn.close()\n\n def execute_sql_duration(self, duration, sql, max_id=0, commit_interval=500):\n self.conn = self.resetConn(timeout=-1)\n cursor = self.conn.cursor()\n start = time.time()\n cnt = 0\n if duration > 0:\n while (time.time() - start) < duration:\n if max_id > 0:\n id = random.randint(1, max_id - 1)\n cursor.execute(sql + str(id) + ';')\n else:\n cursor.execute(sql)\n cnt += 1\n if cnt % commit_interval == 0:\n self.conn.commit()\n else:\n print(\"error, the duration should be larger than 0\")\n self.conn.commit()\n cursor.close()\n self.conn.close()\n return cnt\n\n def concurrent_execute_sql(self, threads, duration, sql, max_id=0, commit_interval=500):\n pool = ThreadPool(threads)\n results = [pool.apply_async(self.execute_sql_duration, (duration, sql, max_id, commit_interval)) for _ in range(threads)]\n pool.close()\n pool.join()\n return results\n \n def build_index(self, table_name, idx_num):\n self.conn = self.resetConn(timeout=-1)\n cursor = self.conn.cursor()\n \n for i in range(0, idx_num):\n the_sql = 'CREATE INDEX index_' + table_name + '_' + str(i) + ' ON ' + table_name + '(name' + str(i) + ');'\n print(the_sql)\n cursor.execute(the_sql)\n\n \n self.conn.commit()\n self.conn.close()\n return\n\n\n \n def drop_index(self,table_name):\n self.conn = self.resetConn(timeout=-1)\n cursor = self.conn.cursor()\n cursor.execute(\"select indexname from pg_indexes where tablename='\"+table_name+\"';\")\n idxs = cursor.fetchall()\n for idx in idxs:\n the_sql = 'DROP INDEX ' + idx[0] + ';'\n cursor.execute(the_sql)\n print(the_sql)\n self.conn.commit()\n self.conn.close()\n return\n\n\ndef init():\n #add the config\n config_path = \"/root/DB-GPT/config/tool_config.yaml\"\n with open(config_path, 'r') as 
config_file:\n config = yaml.safe_load(config_file) \n db_args =DBArgs('pgsql', config)\n return db_args\n\n\n#create a table\ndef create_table(table_name,colsize, ncolumns):\n db=Database(init())\n column_definitions = ', '.join(f'name{i} varchar({colsize})' for i in range(ncolumns))\n creat_sql = f'CREATE TABLE {table_name} (id int, {column_definitions}, time timestamp);'\n db.execute_sqls(creat_sql)\n\n#delete the table\ndef delete_table(table_name):\n db=Database(init())\n delete_sql=f'DROP TABLE if exists {table_name}'\n db.execute_sqls(delete_sql)\n\n#print the current time\ndef print_time():\n current_time = datetime.datetime.now()\n formatted_time = current_time.strftime(\"%Y-%m-%d %H:%M:%S\")\n print(formatted_time)\n\ndef redundent_index(threads,duration,ncolumns,nrows,colsize,nindex,table_name='table1'):\n #create a new table\n print_time()\n delete_table(table_name)\n create_table(table_name,colsize, ncolumns)\n db=Database(init())\n # insert some data to be updated \n insert_definitions = ', '.join(f'(SELECT substr(md5(random()::text), 1, {colsize}))' for i in range(ncolumns))\n insert_data=f'insert into {table_name} select generate_series(1,{nrows}),{insert_definitions}, now();' \n db.execute_sqls(insert_data) \n\n #initialization of the indexes\n nindex=int((nindex*ncolumns)/10)\n db.build_index(table_name,nindex)\n id_index='CREATE INDEX index_'+table_name+'_id ON '+table_name+'(id);'\n db.execute_sqls(id_index)\n\n #lock_contention\n pool = Pool(threads)\n for _ in range(threads):\n pool.apply_async(\n lock, (table_name, ncolumns, colsize, duration, nrows))\n pool.close()\n pool.join()\n\n #drop the index\n db.drop_index(table_name)\n\n #delete the table\n delete_table(table_name)\n print_time()\n\n\ndef lock(table_name, ncolumns, colsize, duration, nrows):\n args=init()\n start = time.time()\n #lock_contention\n while time.time()-start < duration:\n conn = psycopg2.connect(database=args.dbname, user=args.user, password=args.password,\n host=args.host, port=args.port)\n cur = conn.cursor()\n while time.time()-start < duration:\n col_name = random.randint(0, ncolumns-1)\n row_name = random.randint(1, nrows-1)\n lock_contention = f'update {table_name} set name{col_name}=(SELECT substr(md5(random()::text), 1, {colsize})) where id ={row_name}'\n #db.concurrent_execute_sql(threads,duration,lock_contention,nrows)\n cur.execute(lock_contention)\n conn.commit()\n conn.commit()\n conn.close()\n\nif __name__ == \"__main__\":\n # Number of threads to use for concurrent inserts\n num_threads = 5\n \n # Duration for which to run the inserts (in seconds)\n insert_duration = None\n \n # Number of columns in the table\n num_columns = 59\n \n # Number of rows to insert\n num_rows = 459312\n \n # Size of each column (in characters)\n column_size = 87\n \n # Table name\n table_name = 'table1'\n \n nindex=6\n \n # Call the insert_large_data function\n redundent_index(num_threads, insert_duration, num_columns, num_rows, column_size, nindex,table_name)\n", + "description": "In a database used by a financial institution, there are 5 users simultaneously executing queries on a table with 59 columns and 459,312 rows of financial transaction records. Each column has a size of 87 characters. The queries involve redundant indexes that were created at the beginning of the operations and deleted after. 
This process causes additional storage consumption and performance overhead in the database.\n", + "desc": "In a business intelligence scenario, there is a database named 'FinancialAnalysisDB' used for analyzing financial data. This database contains multiple tables, one of which is the 'FinancialData' table, storing data for various financial records. This table consists of 459,312 rows of data, each representing a financial record, with a total of 59 columns, each containing information up to 87 characters. These columns may include transaction ID, transaction type, amount, date, department, project code, budget code, financial year, and audit status.To improve the performance of complex financial queries, the database administrator decides to create redundant indexes on certain columns before executing the queries. These indexes could be based on transaction type, date range, department, project code, or other relevant factors. The purpose of creating redundant indexes is to accelerate query execution and provide faster results.However, creating redundant indexes can have consequences on the database performance and storage usage. The frequent creation and maintenance of indexes might increase the overhead on query processing and potentially lead to database fragmentation. If multiple users simultaneously perform complex financial queries, the database might face increased computational and storage demands, causing potential performance issues and delays in query execution.\n" + }, + "61": { + "start_time": "1697305770", + "end_time": "1697305861", + "start_timestamp": "2023-10-15 01:49:30", + "end_timestamp": "2023-10-15 01:51:01", + "alerts": [], + "labels": [ + "INSERT_LARGE_DATA", + "IO_CONTENTION" + ], + "command": "python anomaly_trigger/main.py --anomaly INSERT_LARGE_DATA,IO_CONTENTION", + "script": "import os\nimport datetime\n\n#print the current time\ndef print_time():\n current_time = datetime.datetime.now()\n formatted_time = current_time.strftime(\"%Y-%m-%d %H:%M:%S\")\n print(formatted_time)\n\nif __name__ == \"__main__\":\n print_time()\n command = (\n \"su - root -c 'cd /sysbench-tpcc-master; \"\n \"./tpcc.lua --db-driver=pgsql --tables=2 --scale=3 --threads=50 --events=0 \"\n \"--pgsql-host=xxxx --pgsql-user=xxxx --pgsql-password=xxxx \"\n \"--pgsql-port=5432 --pgsql-db=tpcc --time=90 --rand-type=uniform --report-interval=10 run'\"\n )\n\n os.system(command)\n print_time()\n", + "description": "In a file sharing system where multiple users share files, there is contention for input/output (I/O) resources when users simultaneously upload, download, or edit files. This competition slows down the file transfer process.\n", + "desc": "In an Internet of Things (IoT) scenario, there is a database specifically designed for collecting and analyzing sensor data. This database is used to handle a large volume of data from various types of sensors. However, when multiple sensors start transmitting data simultaneously at a very high frequency, the database might encounter performance issues. Due to limitations in data partitioning, buffering mechanisms, or indexing, the database's ability to process these concurrent write requests is limited. This can lead to increased write latency and potential database locking, resulting in anomalies. 
These anomalies could affect the overall performance of the IoT system and impact the timely processing of sensor data.\n" + }, + "62": { + "start_time": "1697305921", + "end_time": "1697305981", + "start_timestamp": "2023-10-15 01:52:01", + "end_timestamp": "2023-10-15 01:53:01", + "alerts": [], + "labels": [ + "POOR JOIN PERFORMANCE", + "CPU CONTENTION" + ], + "command": "python anomaly_trigger/main.py --anomaly POOR_JOIN_PERFORMANCE,CPU_CONTENTION", + "script": "import os\nimport re\nimport time\n\nimport psycopg2\n\n\nREPEATCOUNT = 1\nTIMELOGPATH = str(int(time.time())) + \"_job_trigger_time_log.txt\"\nTIMELOG = open(TIMELOGPATH, 'w+')\n\n\nclass Database():\n\n def __init__(self):\n self.conn = None\n self.conn = psycopg2.connect(database='imdbload',\n user='xxxx',\n password='xxxx',\n host='xxxx',\n port=5432)\n\n def execute_sql(self, sql):\n fail = 1\n cur = self.conn.cursor()\n i = 0\n cnt = 3\n while fail == 1 and i < cnt:\n try:\n fail = 0\n cur.execute(sql)\n except BaseException as error:\n fail = 1\n print(error)\n res = []\n if fail == 0:\n res = cur.fetchall()\n i = i + 1\n if fail == 1:\n # print(\"SQL Execution Fatal!!\", sql)\n return 0, ''\n elif fail == 0:\n return 1, res\n\n\ndef all_sql_files():\n res_path = \"{}/join-order-benchmark-master/\".format(\n os.path.dirname(os.path.abspath(__file__)))\n # all_file_list = list(filter(file_filter, os.listdir(res_path)))\n # all_file_list = sorted(all_file_list, key=custom_sort)\n all_file_list = [\n '1a.sql', '1b.sql', '1c.sql', '1d.sql',\n '2a.sql', '2b.sql', '2c.sql', '2d.sql',\n '3a.sql', '3b.sql', '3c.sql',\n '4a.sql', '4b.sql', '4c.sql',\n '5a.sql', '5b.sql', '5c.sql',\n '6a.sql', '6b.sql', '6c.sql', '6d.sql', '6e.sql', '6f.sql',\n '7a.sql', '7b.sql', '7c.sql',\n '8a.sql', '8b.sql', '8c.sql', '8d.sql',\n '9a.sql', '9b.sql', '9c.sql', '9d.sql',\n '10a.sql', '10b.sql', '10c.sql',\n '11a.sql', '11b.sql', '11c.sql', '11d.sql',\n '12a.sql', '12b.sql', '12c.sql',\n '13a.sql', '13b.sql', '13c.sql', '13d.sql',\n '14a.sql', '14b.sql', '14c.sql',\n '15a.sql', '15b.sql', '15c.sql', '15d.sql',\n '16a.sql', '16b.sql', '16c.sql', '16d.sql',\n '17a.sql', '17b.sql', '17c.sql', '17d.sql', '17e.sql', '17f.sql',\n '18a.sql', '18b.sql', '18c.sql',\n '19a.sql', '19b.sql', '19c.sql', '19d.sql',\n '20a.sql', '20b.sql', '20c.sql',\n '21a.sql', '21b.sql', '21c.sql',\n '22a.sql', '22b.sql', '22c.sql', '22d.sql',\n '23a.sql', '23b.sql', '23c.sql',\n '24a.sql', '24b.sql',\n '25a.sql', '25b.sql', '25c.sql',\n '26a.sql', '26b.sql', '26c.sql',\n '27a.sql', '27b.sql', '27c.sql',\n '28a.sql', '28b.sql', '28c.sql',\n '29a.sql', '29b.sql', '29c.sql',\n '30a.sql', '30b.sql', '30c.sql',\n '31a.sql', '31b.sql', '31c.sql',\n '32a.sql', '32b.sql',\n '33a.sql', '33b.sql', '33c.sql']\n\n print(all_file_list)\n files_list = []\n for file in all_file_list:\n files_list.append(res_path + file)\n return files_list\n\n\ndef custom_sort(item):\n # \u63d0\u53d6\u6570\u5b57\u548c\u5b57\u6bcd\u90e8\u5206\n match = re.match(r'(\\d+)(\\D+)', item)\n # \u5c06\u6570\u5b57\u90e8\u5206\u8f6c\u6362\u4e3a\u6574\u6570\u4ee5\u8fdb\u884c\u6bd4\u8f83\n num_part = int(match.group(1))\n # \u8fd4\u56de\u5143\u7ec4\u4ee5\u6309\u6570\u5b57\u548c\u5b57\u6bcd\u6392\u5e8f\n return (num_part, match.group(2))\n\n\ndef file_filter(f):\n if f[-4:] == '.sql' and 'schema' not in f and 'fkindexes' not in f:\n return True\n else:\n return False\n\n\ndef get_sql_from_file(file_name):\n file = open(file_name)\n lines = file.readlines().copy()\n sql = ''\n for line in lines:\n sql += line\n sql = 
sql.replace('\n', ' ').replace(' ', ' ').replace(' ', ' ')\n file.close()\n return sql\n\n\ndef test_hint_from_file(sql_file):\n db = Database()\n sql = get_sql_from_file(sql_file)\n success, result_cont = db.execute_sql(sql)\n print(success, result_cont)\n\n\ndef test_all():\n sql_files = all_sql_files()\n\n for sql_file in list(sql_files)[:-10]:\n if sql_file:\n test_hint_from_file(sql_file)\n\n\ndef test_one():\n res_path = \"{}/join-order-benchmark-master/\".format(\n os.path.dirname(os.path.abspath(__file__)))\n test_hint_from_file(res_path + '1a.sql')\n\n\nif __name__ == '__main__':\n for i in range(0, REPEATCOUNT):\n TIMELOG.write(str(int(time.time()))+\";\")\n test_all()\n TIMELOG.write(str(int(time.time()))+\"\n\")\n TIMELOG.flush()\n\n TIMELOG.close()\n\n)\n", + "description": "In a multi-user system where queries involving complex joins are executed, if there is poor performance due to inefficient or improperly optimized join operations, combined with high CPU contention caused by multiple users competing for CPU resources, it can result in slower query execution times and system performance degradation.\n", + "desc": "In the Internet of Things (IoT) scenario, imagine a smart home automation system that utilizes a database called 'SmartHomeDB'. This database is responsible for storing data from various smart devices installed in a home, such as thermostats, security cameras, motion sensors, and lighting controls. Each device generates data continuously, including temperature readings, camera footage, motion detection events, and lighting status. The 'SmartHomeDB' database maintains a table named 'DeviceData' to store all this information, with each row representing a data entry from a specific device. This table consists of multiple columns, such as device ID, data type, timestamp, device location, and status. In this scenario, the system needs to perform join operations on the 'DeviceData' table to retrieve specific information. For example, it might need to retrieve all temperature readings from a specific room during a certain time period or obtain a complete timeline of events from multiple motion sensors. However, due to poor optimization techniques or insufficient indexing on the join columns, the performance of these join operations might be negatively affected. This can lead to slower query execution times, higher CPU utilization, and overall inefficiency in retrieving the requested data. 
Such poor join performance can impact the functionality and responsiveness of the smart home automation system, making it less effective in providing timely and accurate information to users.\n" + }, + "63": { + "start_time": "1697306041", + "end_time": "1697306198", + "start_timestamp": "2023-10-15 01:54:01", + "end_timestamp": "2023-10-15 01:56:38", + "alerts": [], + "labels": [ + "FETCH_LARGE_DATA", + "CORRELATED SUBQUERY" + ], + "command": "python anomaly_trigger/main.py --anomaly FETCH_LARGE_DATA,CORRELATED_SUBQUERY", + "script": "import os\nimport re\nimport time\n\nimport psycopg2\n\n\nREPEATCOUNT = 1\nTIMELOGPATH = str(int(time.time())) + \"_tpch_trigger_time_log.txt\"\nTIMELOG = open(TIMELOGPATH, 'w+')\n\n\nclass Database():\n\n def __init__(self):\n self.conn = None\n self.conn = psycopg2.connect(database='tpch',\n user='xxxx',\n password='xxxx',\n host='xxxx',\n port=5432)\n\n def execute_sql(self, sql):\n fail = 1\n cur = self.conn.cursor()\n i = 0\n cnt = 3\n while fail == 1 and i < cnt:\n try:\n fail = 0\n cur.execute(sql)\n except BaseException as error:\n fail = 1\n print(error)\n res = []\n if fail == 0:\n res = cur.fetchall()\n i = i + 1\n if fail == 1:\n # print(\"SQL Execution Fatal!!\", sql)\n return 0, ''\n elif fail == 0:\n return 1, res\n\n\ndef all_sql_files():\n res_path = \"{}/tpch-queries/\".format(\n os.path.dirname(os.path.abspath(__file__)))\n # all_file_list = list(filter(file_filter, os.listdir(res_path)))\n # all_file_list = sorted(all_file_list, key=custom_sort)\n all_file_list = [\n '4.explain.sql']\n\n print(all_file_list)\n files_list = []\n for file in all_file_list:\n files_list.append(res_path + file)\n return files_list\n\n\ndef custom_sort(item):\n # \u63d0\u53d6\u6570\u5b57\u548c\u5b57\u6bcd\u90e8\u5206\n match = re.match(r'(\\d+)(\\D+)', item)\n # \u5c06\u6570\u5b57\u90e8\u5206\u8f6c\u6362\u4e3a\u6574\u6570\u4ee5\u8fdb\u884c\u6bd4\u8f83\n num_part = int(match.group(1))\n # \u8fd4\u56de\u5143\u7ec4\u4ee5\u6309\u6570\u5b57\u548c\u5b57\u6bcd\u6392\u5e8f\n return (num_part, match.group(2))\n\n\ndef file_filter(f):\n if f[-4:] == '.sql' and 'schema' not in f and 'fkindexes' not in f:\n return True\n else:\n return False\n\n\ndef get_sql_from_file(file_name):\n file = open(file_name)\n lines = file.readlines().copy()\n sql = ''\n for line in lines:\n sql += line\n sql = sql.replace('\n', ' ').replace(' ', ' ').replace(' ', ' ')\n file.close()\n return sql\n\n\ndef test_hint_from_file(sql_file):\n db = Database()\n sql = get_sql_from_file(sql_file)\n success, result_cont = db.execute_sql(sql)\n print(success, result_cont)\n\n\ndef test_all():\n sql_files = all_sql_files()\n\n for sql_file in list(sql_files):\n if sql_file:\n test_hint_from_file(sql_file)\n\n\ndef test_one():\n res_path = \"{}/tpch-queries/\".format(\n os.path.dirname(os.path.abspath(__file__)))\n test_hint_from_file(res_path + '1.explain.sql')\n\n\nif __name__ == '__main__':\n for i in range(0, REPEATCOUNT):\n TIMELOG.write(str(int(time.time()))+\";\")\n test_all()\n TIMELOG.write(str(int(time.time()))+\"\n\")\n TIMELOG.flush()\n\n TIMELOG.close()\n", + "description": "In an online shopping platform, there is a problem with fetching large amounts of data and running correlated subqueries. This could occur when trying to retrieve inventory information for numerous products. 
Without optimizing the subqueries, the performance of the inventory query may be negatively affected.\n", + "desc": "In an Internet of Things (IoT) scenario, there is a database named 'SensorDataDB' that stores data collected from various sensors. This database is designed to handle a large volume of sensor data. The key table in this database is called 'SensorReadings', which contains information about the readings from 100 sensors. Each row in the table represents a sensor reading and contains fields such as sensor ID, reading type, reading value, timestamp, sensor location, and status information. When there is a high frequency of data transmission from all 100 sensors simultaneously, the database can encounter performance issues. Due to the lack of effective data partitioning, buffering mechanisms, or proper indexing in the 'SensorReadings' table, the database's ability to process a large number of concurrent write requests is limited. This can result in increased write latency and even database locking, leading to anomalies in the database.\n" + }, + "64": { + "start_time": "1697306259", + "end_time": "1697306331", + "start_timestamp": "2023-10-15 01:57:39", + "end_timestamp": "2023-10-15 01:58:51", + "alerts": [], + "labels": [ + "highly concurrent commits or highly concurrent inserts" + ], + "command": "python anomaly_trigger/main.py --anomaly INSERT_LARGE_DATA", + "script": "import psycopg2\nimport sys\nsys.path.append('/root/DB-GPT/')\nimport time\nimport datetime\nimport random\nimport yaml\nfrom multiprocessing.pool import *\n\nclass DBArgs(object):\n\n def __init__(self, dbtype, config, dbname=None):\n self.dbtype = dbtype\n if self.dbtype == 'mysql':\n self.host = config['host']\n self.port = config['port']\n self.user = config['user']\n self.password = config['password']\n self.dbname = dbname if dbname else config['dbname']\n self.driver = 'com.mysql.jdbc.Driver'\n self.jdbc = 'jdbc:mysql://'\n else:\n self.host = config['host']\n self.port = config['port']\n self.user = config['user']\n self.password = config['password']\n self.dbname = dbname if dbname else config['dbname']\n self.driver = 'org.postgresql.Driver'\n self.jdbc = 'jdbc:postgresql://'\n\nclass Database():\n def __init__(self, args, timeout=-1):\n self.args = args\n self.conn = self.resetConn(timeout)\n\n\n # self.schema = self.compute_table_schema()\n\n def resetConn(self, timeout=-1):\n conn = psycopg2.connect(database=self.args.dbname,\n user=self.args.user,\n password=self.args.password,\n host=self.args.host,\n port=self.args.port)\n return conn\n\n def execute_sqls(self,sql):\n self.conn =self.resetConn(timeout=-1)\n cur = self.conn.cursor()\n cur.execute(sql)\n self.conn.commit()\n cur.close()\n self.conn.close()\n\n def execute_sql_duration(self, duration, sql, max_id=0, commit_interval=500):\n self.conn = self.resetConn(timeout=-1)\n cursor = self.conn.cursor()\n start = time.time()\n cnt = 0\n if duration > 0:\n while (time.time() - start) < duration:\n if max_id > 0:\n id = random.randint(1, max_id - 1)\n cursor.execute(sql + str(id) + ';')\n else:\n cursor.execute(sql)\n cnt += 1\n if cnt % commit_interval == 0:\n self.conn.commit()\n else:\n print(\"error, the duration should be larger than 0\")\n self.conn.commit()\n cursor.close()\n self.conn.close()\n return cnt\n\n def concurrent_execute_sql(self, threads, duration, sql, max_id=0, commit_interval=500):\n pool = ThreadPool(threads)\n results = [pool.apply_async(self.execute_sql_duration, (duration, sql, max_id, commit_interval)) for _ in 
range(threads)]\n pool.close()\n pool.join()\n return results\n \ndef init():\n #add the config\n config_path = \"/root/DB-GPT/config/tool_config.yaml\"\n with open(config_path, 'r') as config_file:\n config = yaml.safe_load(config_file) \n db_args =DBArgs('pgsql', config)\n return db_args\n\n#create a table\ndef create_table(table_name,colsize, ncolumns):\n db=Database(init())\n column_definitions = ', '.join(f'name{i} varchar({colsize})' for i in range(ncolumns))\n creat_sql = f'CREATE TABLE {table_name} (id int, {column_definitions}, time timestamp);'\n db.execute_sqls(creat_sql)\n\n#delete the table\ndef delete_table(table_name):\n db=Database(init())\n delete_sql=f'DROP TABLE if exists {table_name}'\n db.execute_sqls(delete_sql)\n\n#print the current time\ndef print_time():\n current_time = datetime.datetime.now()\n formatted_time = current_time.strftime(\"%Y-%m-%d %H:%M:%S\")\n print(formatted_time)\n\n\ndef insert_large_data(threads,duration,ncolumns,nrows,colsize,table_name='table1'):\n\n print_time()\n #Delete undeleted tables\n delete_table(table_name)\n #create a new table\n create_table(table_name,colsize, ncolumns)\n db=Database(init())\n #insert the data\n #insert_definitions = ', '.join(f'repeat(round(random()*999)::text,{(colsize//3)})' for i in range(ncolumns))\n insert_definitions = ', '.join(f'(SELECT substr(md5(random()::text), 1, {colsize}))' for i in range(ncolumns))\n insert_data=f'insert into {table_name} select generate_series(1,{nrows}),{insert_definitions}, now();'\n db.concurrent_execute_sql(threads,duration,insert_data,commit_interval=1)\n\n #delete the table\n delete_table(table_name)\n \n #print the end time\n print_time()\nif __name__ == \"__main__\":\n # Number of threads to use for concurrent inserts\n num_threads = 151\n \n # Duration for which to run the inserts (in seconds)\n insert_duration = None\n \n # Number of columns in the table\n num_columns = 5\n \n # Number of rows to insert\n num_rows = 93\n \n # Size of each column (in characters)\n column_size = 53\n \n # Table name\n table_name = 'table1'\n \n # Call the insert_large_data function\n insert_large_data(num_threads, insert_duration, num_columns, num_rows, column_size, table_name)\n", + "description": "In an IoT application where 151 sensors generate a large amount of data, the data needs to be inserted into the database simultaneously. The database will face an exception due to the high volume of concurrent data insertion. The database table contains 5 columns, each with a size of 53 characters, and has 93 rows of data.\n", + "desc": "In an Internet of Things (IoT) scenario, there is a database designed for collecting and analyzing sensor data, called 'SensorDataDB'. This database handles a large volume of data from various types of sensors. The main table in this database is called 'SensorReadings', which stores information about the readings from these sensors. It consists of 93 rows of data, each representing a sensor reading, with a total of 5 columns, each containing information of up to 53 characters. These columns may include sensor ID, reading type, reading value, timestamp, and sensor location. The 'SensorDataDB' database is designed to handle concurrent insert operations, such as when multiple sensors send data simultaneously. However, in this specific case, 151 threads are executing insert operations concurrently, which might exceed the capacity of the database, causing performance issues. 
Due to the lack of efficient data buffering or improper indexing, the database's ability to handle these numerous concurrent insert requests might be limited. This can lead to increased insert latency and potentially result in anomalies in the database. These anomalies might manifest as delayed insert operations, database locking, or even failures in inserting new sensor readings. Overall, this situation can impact the effectiveness and responsiveness of real-time data processing in the IoT system.\n" + }, + "65": { + "start_time": "1697306391", + "end_time": "1697306463", + "start_timestamp": "2023-10-15 01:59:51", + "end_timestamp": "2023-10-15 02:01:03", + "alerts": [], + "labels": [ + "highly concurrent commits or highly concurrent inserts" + ], + "command": "python anomaly_trigger/main.py --anomaly INSERT_LARGE_DATA", + "script": "import psycopg2\nimport sys\nsys.path.append('/root/DB-GPT/')\nimport time\nimport datetime\nimport random\nimport yaml\nfrom multiprocessing.pool import *\n\nclass DBArgs(object):\n\n def __init__(self, dbtype, config, dbname=None):\n self.dbtype = dbtype\n if self.dbtype == 'mysql':\n self.host = config['host']\n self.port = config['port']\n self.user = config['user']\n self.password = config['password']\n self.dbname = dbname if dbname else config['dbname']\n self.driver = 'com.mysql.jdbc.Driver'\n self.jdbc = 'jdbc:mysql://'\n else:\n self.host = config['host']\n self.port = config['port']\n self.user = config['user']\n self.password = config['password']\n self.dbname = dbname if dbname else config['dbname']\n self.driver = 'org.postgresql.Driver'\n self.jdbc = 'jdbc:postgresql://'\n\nclass Database():\n def __init__(self, args, timeout=-1):\n self.args = args\n self.conn = self.resetConn(timeout)\n\n\n # self.schema = self.compute_table_schema()\n\n def resetConn(self, timeout=-1):\n conn = psycopg2.connect(database=self.args.dbname,\n user=self.args.user,\n password=self.args.password,\n host=self.args.host,\n port=self.args.port)\n return conn\n\n def execute_sqls(self,sql):\n self.conn =self.resetConn(timeout=-1)\n cur = self.conn.cursor()\n cur.execute(sql)\n self.conn.commit()\n cur.close()\n self.conn.close()\n\n def execute_sql_duration(self, duration, sql, max_id=0, commit_interval=500):\n self.conn = self.resetConn(timeout=-1)\n cursor = self.conn.cursor()\n start = time.time()\n cnt = 0\n if duration > 0:\n while (time.time() - start) < duration:\n if max_id > 0:\n id = random.randint(1, max_id - 1)\n cursor.execute(sql + str(id) + ';')\n else:\n cursor.execute(sql)\n cnt += 1\n if cnt % commit_interval == 0:\n self.conn.commit()\n else:\n print(\"error, the duration should be larger than 0\")\n self.conn.commit()\n cursor.close()\n self.conn.close()\n return cnt\n\n def concurrent_execute_sql(self, threads, duration, sql, max_id=0, commit_interval=500):\n pool = ThreadPool(threads)\n results = [pool.apply_async(self.execute_sql_duration, (duration, sql, max_id, commit_interval)) for _ in range(threads)]\n pool.close()\n pool.join()\n return results\n \ndef init():\n #add the config\n config_path = \"/root/DB-GPT/config/tool_config.yaml\"\n with open(config_path, 'r') as config_file:\n config = yaml.safe_load(config_file) \n db_args =DBArgs('pgsql', config)\n return db_args\n\n#create a table\ndef create_table(table_name,colsize, ncolumns):\n db=Database(init())\n column_definitions = ', '.join(f'name{i} varchar({colsize})' for i in range(ncolumns))\n creat_sql = f'CREATE TABLE {table_name} (id int, {column_definitions}, time timestamp);'\n 
db.execute_sqls(creat_sql)\n\n#delete the table\ndef delete_table(table_name):\n db=Database(init())\n delete_sql=f'DROP TABLE if exists {table_name}'\n db.execute_sqls(delete_sql)\n\n#print the current time\ndef print_time():\n current_time = datetime.datetime.now()\n formatted_time = current_time.strftime(\"%Y-%m-%d %H:%M:%S\")\n print(formatted_time)\n\n\ndef insert_large_data(threads,duration,ncolumns,nrows,colsize,table_name='table1'):\n\n print_time()\n #Delete undeleted tables\n delete_table(table_name)\n #create a new table\n create_table(table_name,colsize, ncolumns)\n db=Database(init())\n #insert the data\n #insert_definitions = ', '.join(f'repeat(round(random()*999)::text,{(colsize//3)})' for i in range(ncolumns))\n insert_definitions = ', '.join(f'(SELECT substr(md5(random()::text), 1, {colsize}))' for i in range(ncolumns))\n insert_data=f'insert into {table_name} select generate_series(1,{nrows}),{insert_definitions}, now();'\n db.concurrent_execute_sql(threads,duration,insert_data,commit_interval=1)\n\n #delete the table\n delete_table(table_name)\n \n #print the end time\n print_time()\nif __name__ == \"__main__\":\n # Number of threads to use for concurrent inserts\n num_threads = 151\n \n # Duration for which to run the inserts (in seconds)\n insert_duration = None\n \n # Number of columns in the table\n num_columns = 21\n \n # Number of rows to insert\n num_rows = 71\n \n # Size of each column (in characters)\n column_size = 70\n \n # Table name\n table_name = 'table1'\n \n # Call the insert_large_data function\n insert_large_data(num_threads, insert_duration, num_columns, num_rows, column_size, table_name)\n", + "description": "In a smart home system, where 151 sensors generate a large amount of data that needs to be inserted into the database simultaneously using a script. The database table contains 21 columns and 71 rows of sensor readings, with each column having a size of 70 characters. This process simulates the database exception caused by the simultaneous data insertion.\n", + "desc": "In the Internet of Things (IoT) scenario, there is a database used for collecting and analyzing sensor data. This database is designed to handle a large volume of data from various types of sensors. The primary table in the database is called 'SensorData', which stores sensor readings including sensor ID, reading type, value, timestamp, location, and status. In this case, there are 71 rows of sensor data, each row representing a reading, with 21 columns, each containing information of up to 70 characters. Suppose there is a situation where 151 sensors start transmitting data simultaneously at a very high frequency. This could cause performance issues in the database, as it may not be able to efficiently handle the large influx of data. This can lead to increased write latency and potential anomalies in the database. 
It is important to ensure that the database and its infrastructure are optimized to handle such high-volume data ingestion to maintain data integrity and system efficiency.\n" + }, + "66": { + "start_time": "1697306523", + "end_time": "1697306583", + "start_timestamp": "2023-10-15 02:02:03", + "end_timestamp": "2023-10-15 02:03:03", + "alerts": [], + "labels": [ + "highly concurrent updates" + ], + "command": "python anomaly_trigger/main.py --anomaly LOCK_CONTENTION", + "script": "import psycopg2\nimport sys\nsys.path.append('/root/DB-GPT/')\nimport time\nimport datetime\nimport random\nimport yaml\nfrom multiprocessing.pool import *\n\n\nclass DBArgs(object):\n\n def __init__(self, dbtype, config, dbname=None):\n self.dbtype = dbtype\n if self.dbtype == 'mysql':\n self.host = config['host']\n self.port = config['port']\n self.user = config['user']\n self.password = config['password']\n self.dbname = dbname if dbname else config['dbname']\n self.driver = 'com.mysql.jdbc.Driver'\n self.jdbc = 'jdbc:mysql://'\n else:\n self.host = config['host']\n self.port = config['port']\n self.user = config['user']\n self.password = config['password']\n self.dbname = dbname if dbname else config['dbname']\n self.driver = 'org.postgresql.Driver'\n self.jdbc = 'jdbc:postgresql://'\n\nclass Database():\n def __init__(self, args, timeout=-1):\n self.args = args\n self.conn = self.resetConn(timeout)\n\n\n # self.schema = self.compute_table_schema()\n\n def resetConn(self, timeout=-1):\n conn = psycopg2.connect(database=self.args.dbname,\n user=self.args.user,\n password=self.args.password,\n host=self.args.host,\n port=self.args.port)\n return conn\n \n def execute_sqls(self,sql):\n self.conn =self.resetConn(timeout=-1)\n cur = self.conn.cursor()\n cur.execute(sql)\n self.conn.commit()\n cur.close()\n self.conn.close()\n\n def execute_sql_duration(self, duration, sql, max_id=0, commit_interval=500):\n self.conn = self.resetConn(timeout=-1)\n cursor = self.conn.cursor()\n start = time.time()\n cnt = 0\n if duration > 0:\n while (time.time() - start) < duration:\n if max_id > 0:\n id = random.randint(1, max_id - 1)\n cursor.execute(sql + str(id) + ';')\n else:\n cursor.execute(sql)\n cnt += 1\n if cnt % commit_interval == 0:\n self.conn.commit()\n else:\n print(\"error, the duration should be larger than 0\")\n self.conn.commit()\n cursor.close()\n self.conn.close()\n return cnt\n\n def concurrent_execute_sql(self, threads, duration, sql, max_id=0, commit_interval=500):\n pool = ThreadPool(threads)\n results = [pool.apply_async(self.execute_sql_duration, (duration, sql, max_id, commit_interval)) for _ in range(threads)]\n pool.close()\n pool.join()\n return results\n\ndef init():\n #add the config\n config_path = \"/root/DB-GPT/config/tool_config.yaml\"\n with open(config_path, 'r') as config_file:\n config = yaml.safe_load(config_file) \n db_args =DBArgs('pgsql', config)\n return db_args\n\n\n#create a table\ndef create_table(table_name,colsize, ncolumns):\n db=Database(init())\n column_definitions = ', '.join(f'name{i} varchar({colsize})' for i in range(ncolumns))\n creat_sql = f'CREATE TABLE {table_name} (id int, {column_definitions}, time timestamp);'\n db.execute_sqls(creat_sql)\n\n#delete the table\ndef delete_table(table_name):\n db=Database(init())\n delete_sql=f'DROP TABLE if exists {table_name}'\n db.execute_sqls(delete_sql)\n\n#print the current time\ndef print_time():\n current_time = datetime.datetime.now()\n formatted_time = current_time.strftime(\"%Y-%m-%d %H:%M:%S\")\n 
print(formatted_time)\n\n\ndef lock_contention(threads,duration,ncolumns,nrows,colsize,table_name='table1'):\n #create a new table\n print_time()\n delete_table(table_name)\n create_table(table_name,colsize, ncolumns)\n db=Database(init())\n # insert some data to be updated \n insert_definitions = ', '.join(f'(SELECT substr(md5(random()::text), 1, {colsize}))' for i in range(ncolumns))\n insert_data=f'insert into {table_name} select generate_series(1,{nrows}),{insert_definitions}, now();' \n db.execute_sqls(insert_data) \n pool = Pool(threads)\n for _ in range(threads):\n pool.apply_async(\n lock, (table_name, ncolumns, colsize, duration, nrows))\n pool.close()\n pool.join()\n #delete the table\n delete_table(table_name)\n print_time()\n\ndef lock(table_name, ncolumns, colsize, duration, nrows):\n args=init()\n start = time.time()\n #lock_contention\n while time.time()-start < duration:\n conn = psycopg2.connect(database=args.dbname, user=args.user, password=args.password,\n host=args.host, port=args.port)\n cur = conn.cursor()\n while time.time()-start < duration:\n col_name = random.randint(0, ncolumns-1)\n row_name = random.randint(1, nrows-1)\n lock_contention = f'update {table_name} set name{col_name}=(SELECT substr(md5(random()::text), 1, {colsize})) where id ={row_name}'\n #db.concurrent_execute_sql(threads,duration,lock_contention,nrows)\n cur.execute(lock_contention)\n conn.commit()\n conn.commit()\n conn.close()\n\nif __name__ == \"__main__\":\n # Number of threads to use for concurrent inserts\n num_threads = 106\n \n # Duration for which to run the inserts (in seconds)\n insert_duration = None\n \n # Number of columns in the table\n num_columns = 50\n \n # Number of rows to insert\n num_rows = 396\n \n # Size of each column (in characters)\n column_size = 64\n \n # Table name\n table_name = 'table1'\n \n # Call the insert_large_data function\n lock_contention(num_threads, insert_duration, num_columns, num_rows, column_size, table_name)\n", + "description": "In the database of an online store, 106 users simultaneously attempt to perform frequent update operations in a database table containing 50 columns and 396 rows of product records, each with a column size of 64 characters. These users compete with each other to lock the database table and perform the update operations.\n", + "desc": "In a banking scenario, imagine a database named 'BankingDB' that stores customer and transaction data for a bank. Within this database, there is a pivotal table called 'AccountTransactions' that records detailed information about various banking transactions. This table contains 396 rows of data, each representing a transaction record for an account. It consists of 50 columns, with each column containing information of up to 64 characters. These columns may include transaction ID, account number, transaction type (such as deposit, withdrawal, or transfer), transaction amount, transaction date and time, counterparty account information, transaction status, employee ID (of the bank staff who processed the transaction), transaction location, currency type, and more. During peak banking hours, a high number of users, such as bank staff, automated systems, or customers through an online banking platform, are actively performing frequent update operations on the 'AccountTransactions' table. These update operations can include modifying transaction statuses, updating transaction amounts, or adding transaction notes. 
In one particular instance, there are 106 users attempting to update the same or adjacent rows of data in this table concurrently. Due to the table's design and the database's locking mechanism, these simultaneous update operations trigger contention for locking the database table, leading to performance issues. This contention for locking, if it lasts for a prolonged period, can cause delayed processing or failure of other users' transaction requests. This, in turn, affects the daily operations of the bank. Additionally, if such incidents occur frequently, the size of the database transaction log may grow rapidly, consuming excessive storage space and potentially causing temporary interruptions in database services.\n" + }, + "67": { + "start_time": "1697306643", + "end_time": "1697306740", + "start_timestamp": "2023-10-15 02:04:03", + "end_timestamp": "2023-10-15 02:05:40", + "alerts": [], + "labels": [ + "highly deletes" + ], + "command": "python anomaly_trigger/main.py --anomaly VACUUM", + "script": "import psycopg2\nimport sys\nsys.path.append('/root/DB-GPT/')\nimport time\nimport datetime\nimport random\nimport yaml\nfrom multiprocessing.pool import *\n\n\nclass DBArgs(object):\n\n def __init__(self, dbtype, config, dbname=None):\n self.dbtype = dbtype\n if self.dbtype == 'mysql':\n self.host = config['host']\n self.port = config['port']\n self.user = config['user']\n self.password = config['password']\n self.dbname = dbname if dbname else config['dbname']\n self.driver = 'com.mysql.jdbc.Driver'\n self.jdbc = 'jdbc:mysql://'\n else:\n self.host = config['host']\n self.port = config['port']\n self.user = config['user']\n self.password = config['password']\n self.dbname = dbname if dbname else config['dbname']\n self.driver = 'org.postgresql.Driver'\n self.jdbc = 'jdbc:postgresql://'\n\nclass Database():\n def __init__(self, args, timeout=-1):\n self.args = args\n self.conn = self.resetConn(timeout)\n\n\n # self.schema = self.compute_table_schema()\n\n def resetConn(self, timeout=-1):\n conn = psycopg2.connect(database=self.args.dbname,\n user=self.args.user,\n password=self.args.password,\n host=self.args.host,\n port=self.args.port)\n return conn\n \n def execute_sqls(self,sql):\n self.conn =self.resetConn(timeout=-1)\n cur = self.conn.cursor()\n cur.execute(sql)\n self.conn.commit()\n cur.close()\n self.conn.close()\n\n def execute_sql_duration(self, duration, sql, max_id=0, commit_interval=500):\n self.conn = self.resetConn(timeout=-1)\n cursor = self.conn.cursor()\n start = time.time()\n cnt = 0\n if duration > 0:\n while (time.time() - start) < duration:\n if max_id > 0:\n id = random.randint(1, max_id - 1)\n cursor.execute(sql + str(id) + ';')\n else:\n cursor.execute(sql)\n cnt += 1\n if cnt % commit_interval == 0:\n self.conn.commit()\n else:\n print(\"error, the duration should be larger than 0\")\n self.conn.commit()\n cursor.close()\n self.conn.close()\n return cnt\n\n def concurrent_execute_sql(self, threads, duration, sql, max_id=0, commit_interval=500):\n pool = ThreadPool(threads)\n results = [pool.apply_async(self.execute_sql_duration, (duration, sql, max_id, commit_interval)) for _ in range(threads)]\n pool.close()\n pool.join()\n return results\n \ndef init():\n #add the config\n config_path = \"/root/DB-GPT/config/tool_config.yaml\"\n with open(config_path, 'r') as config_file:\n config = yaml.safe_load(config_file) \n db_args =DBArgs('pgsql', config)\n return db_args\n\n\n#create a table\ndef create_table(table_name,colsize, ncolumns):\n db=Database(init())\n 
column_definitions = ', '.join(f'name{i} varchar({colsize})' for i in range(ncolumns))\n creat_sql = f'CREATE TABLE {table_name} (id int, {column_definitions}, time timestamp);'\n db.execute_sqls(creat_sql)\n\n#delete the table\ndef delete_table(table_name):\n db=Database(init())\n delete_sql=f'DROP TABLE if exists {table_name}'\n db.execute_sqls(delete_sql)\n\n#print the current time\ndef print_time():\n current_time = datetime.datetime.now()\n formatted_time = current_time.strftime(\"%Y-%m-%d %H:%M:%S\")\n print(formatted_time)\n\n\ndef vacuum(threads,duration,ncolumns,nrows,colsize,table_name='table1'):\n db=Database(init())\n #create a new table\n print_time()\n delete_table(table_name)\n create_table(table_name,colsize, ncolumns)\n\n # insert some data to be deleted\n insert_definitions = ', '.join(f'(SELECT substr(md5(random()::text), 1, {colsize}))' for i in range(ncolumns))\n insert_data=f'insert into {table_name} select generate_series(1,{nrows}),{insert_definitions}, now();' \n db.execute_sqls(insert_data) \n\n # delete 80% of the rows\n delete_nrows=int(nrows*0.8)\n vacuum=f'delete from {table_name} where id < {delete_nrows};'\n db.execute_sqls(vacuum)\n\n # do the select , then the vacuum occurs\n select='select * from '+table_name+' where id='\n db.concurrent_execute_sql(threads,duration,select,nrows)\n\n #delete the table\n delete_table(table_name)\n print_time()\n\n\nif __name__ == \"__main__\":\n # Number of threads to use for concurrent inserts\n num_threads = 89\n \n # Duration for which to run the inserts (in seconds)\n insert_duration = None\n \n # Number of columns in the table\n num_columns = 5\n \n # Number of rows to insert\n num_rows = 2765909\n \n # Size of each column (in characters)\n column_size = 70\n \n # Table name\n table_name = 'table1'\n \n # Call the insert_large_data function\n vacuum(num_threads, insert_duration, num_columns, num_rows, column_size, table_name)\n", + "description": "In the back-end database of an online platform, 89 users are performing a search operation simultaneously, using 5 columns in a table with 2,765,909 rows. Each column can store up to 70 characters. The search operations are performed after a database maintenance operation called \"VACUUM\", which may cause a delay or exception in the database due to the large amounts of data being processed.\n", + "desc": "In the Internet of Things (IoT) scenario of sensor data collection and analysis, there is a database named 'SensorDataDB'. This database is responsible for storing and processing data from various sensors. One of the key tables in this database is called 'SensorReadings', which contains detailed information about sensor readings. This table consists of a large number of rows (2,765,909 in this case), each representing a reading from a sensor. The table has 5 columns, each with a maximum size of 70 characters. These columns may include sensor ID, reading type, reading value, timestamp, and sensor location. In this scenario, the database administrator needs to optimize the performance of the 'SensorReadings' table by executing a VACUUM operation. The VACUUM operation is used to reclaim unused space and improve the efficiency of database operations. During the VACUUM operation, the database might move rows to reduce fragmentation, update statistics, and free up space. 
The VACUUM operation can help improve the performance of the database when dealing with a large number of rows and frequent data updates.By executing the given command, the VACUUM operation will be performed on the 'SensorReadings' table with a concurrency level of 89 threads. This means that the VACUUM operation will be executed in parallel using multiple threads, which can help improve the efficiency and speed of the operation. The VACUUM operation will also consider the specific column and row sizes mentioned, as well as the total number of rows, to optimize the execution.Overall, the VACUUM operation in this scenario aims to optimize the performance of the database by reclaiming space, reducing fragmentation, and improving query and data manipulation efficiency for the 'SensorReadings' table in the 'SensorDataDB' database.Imagine a business intelligence scenario where there is a database named 'BusinessIntelDB' used for analyzing and generating reports based on various business data. This database contains multiple tables, one of which is a key table named 'DataRecords'. The 'DataRecords' table contains a large volume of data, with 427,732 rows, each representing a specific data record. It has 76 columns, each capable of storing up to 100 characters. These columns may include data ID, data type (such as sales, expenses, profits, market trends), data value, date and time, source department, geographical region, and more.In this scenario, the database administrator frequently encounters queries that require complex analysis and reporting on the 'DataRecords' table. To optimize the query performance, the administrator might create multiple indexes on the table based on different data types, time ranges, or geographical factors. However, because of the large number of indexes, there is a risk of creating redundant indexes, which are not truly necessary for the specific queries being executed.To investigate and better understand the impact of these redundant indexes, the administrator runs a script with the provided command. By setting the '--anomaly' parameter to 'REDUNDANT_INDEX', the script triggers scenarios related to redundant index creation and its potential impact on database performance.In this particular command, the script is configured to run with 6 concurrent threads, each representing a user or a query request attempting to utilize the indexes. The '--ncolumn' parameter is set to 76, indicating the number of columns in the table, and the 'colsize' parameter is set to 100, indicating the maximum size of each column. The 'nrow' parameter is set to 427,732, indicating the total number of rows in the 'DataRecords' table.By running this script, the administrator aims to observe and analyze the performance impact of redundant index creation and assess whether any optimizations or adjustments need to be made to improve the overall efficiency of the database in this business intelligence scenario. 
\n" + }, + "68": { + "start_time": "1697306800", + "end_time": "1697306914", + "start_timestamp": "2023-10-15 02:06:40", + "end_timestamp": "2023-10-15 02:08:34", + "alerts": [], + "labels": [ + "too many indexes" + ], + "command": "python anomaly_trigger/main.py --anomaly REDUNDANT_INDEX", + "script": "import psycopg2\nimport sys\nsys.path.append('/root/DB-GPT/')\nimport time\nimport datetime\nimport random\nimport yaml\nfrom multiprocessing.pool import *\n\n\nclass DBArgs(object):\n\n def __init__(self, dbtype, config, dbname=None):\n self.dbtype = dbtype\n if self.dbtype == 'mysql':\n self.host = config['host']\n self.port = config['port']\n self.user = config['user']\n self.password = config['password']\n self.dbname = dbname if dbname else config['dbname']\n self.driver = 'com.mysql.jdbc.Driver'\n self.jdbc = 'jdbc:mysql://'\n else:\n self.host = config['host']\n self.port = config['port']\n self.user = config['user']\n self.password = config['password']\n self.dbname = dbname if dbname else config['dbname']\n self.driver = 'org.postgresql.Driver'\n self.jdbc = 'jdbc:postgresql://'\n\nclass Database():\n def __init__(self, args, timeout=-1):\n self.args = args\n self.conn = self.resetConn(timeout)\n\n\n # self.schema = self.compute_table_schema()\n\n def resetConn(self, timeout=-1):\n conn = psycopg2.connect(database=self.args.dbname,\n user=self.args.user,\n password=self.args.password,\n host=self.args.host,\n port=self.args.port)\n return conn\n \n def execute_sqls(self,sql):\n self.conn =self.resetConn(timeout=-1)\n cur = self.conn.cursor()\n cur.execute(sql)\n self.conn.commit()\n cur.close()\n self.conn.close()\n\n def execute_sql_duration(self, duration, sql, max_id=0, commit_interval=500):\n self.conn = self.resetConn(timeout=-1)\n cursor = self.conn.cursor()\n start = time.time()\n cnt = 0\n if duration > 0:\n while (time.time() - start) < duration:\n if max_id > 0:\n id = random.randint(1, max_id - 1)\n cursor.execute(sql + str(id) + ';')\n else:\n cursor.execute(sql)\n cnt += 1\n if cnt % commit_interval == 0:\n self.conn.commit()\n else:\n print(\"error, the duration should be larger than 0\")\n self.conn.commit()\n cursor.close()\n self.conn.close()\n return cnt\n\n def concurrent_execute_sql(self, threads, duration, sql, max_id=0, commit_interval=500):\n pool = ThreadPool(threads)\n results = [pool.apply_async(self.execute_sql_duration, (duration, sql, max_id, commit_interval)) for _ in range(threads)]\n pool.close()\n pool.join()\n return results\n \n def build_index(self, table_name, idx_num):\n self.conn = self.resetConn(timeout=-1)\n cursor = self.conn.cursor()\n \n for i in range(0, idx_num):\n the_sql = 'CREATE INDEX index_' + table_name + '_' + str(i) + ' ON ' + table_name + '(name' + str(i) + ');'\n print(the_sql)\n cursor.execute(the_sql)\n\n \n self.conn.commit()\n self.conn.close()\n return\n\n\n \n def drop_index(self,table_name):\n self.conn = self.resetConn(timeout=-1)\n cursor = self.conn.cursor()\n cursor.execute(\"select indexname from pg_indexes where tablename='\"+table_name+\"';\")\n idxs = cursor.fetchall()\n for idx in idxs:\n the_sql = 'DROP INDEX ' + idx[0] + ';'\n cursor.execute(the_sql)\n print(the_sql)\n self.conn.commit()\n self.conn.close()\n return\n\n\ndef init():\n #add the config\n config_path = \"/root/DB-GPT/config/tool_config.yaml\"\n with open(config_path, 'r') as config_file:\n config = yaml.safe_load(config_file) \n db_args =DBArgs('pgsql', config)\n return db_args\n\n\n#create a table\ndef create_table(table_name,colsize, 
ncolumns):\n db=Database(init())\n column_definitions = ', '.join(f'name{i} varchar({colsize})' for i in range(ncolumns))\n creat_sql = f'CREATE TABLE {table_name} (id int, {column_definitions}, time timestamp);'\n db.execute_sqls(creat_sql)\n\n#delete the table\ndef delete_table(table_name):\n db=Database(init())\n delete_sql=f'DROP TABLE if exists {table_name}'\n db.execute_sqls(delete_sql)\n\n#print the current time\ndef print_time():\n current_time = datetime.datetime.now()\n formatted_time = current_time.strftime(\"%Y-%m-%d %H:%M:%S\")\n print(formatted_time)\n\ndef redundent_index(threads,duration,ncolumns,nrows,colsize,nindex,table_name='table1'):\n #create a new table\n print_time()\n delete_table(table_name)\n create_table(table_name,colsize, ncolumns)\n db=Database(init())\n # insert some data to be updated \n insert_definitions = ', '.join(f'(SELECT substr(md5(random()::text), 1, {colsize}))' for i in range(ncolumns))\n insert_data=f'insert into {table_name} select generate_series(1,{nrows}),{insert_definitions}, now();' \n db.execute_sqls(insert_data) \n\n #initialization of the indexes\n nindex=int((nindex*ncolumns)/10)\n db.build_index(table_name,nindex)\n id_index='CREATE INDEX index_'+table_name+'_id ON '+table_name+'(id);'\n db.execute_sqls(id_index)\n\n #lock_contention\n pool = Pool(threads)\n for _ in range(threads):\n pool.apply_async(\n lock, (table_name, ncolumns, colsize, duration, nrows))\n pool.close()\n pool.join()\n\n #drop the index\n db.drop_index(table_name)\n\n #delete the table\n delete_table(table_name)\n print_time()\n\n\ndef lock(table_name, ncolumns, colsize, duration, nrows):\n args=init()\n start = time.time()\n #lock_contention\n while time.time()-start < duration:\n conn = psycopg2.connect(database=args.dbname, user=args.user, password=args.password,\n host=args.host, port=args.port)\n cur = conn.cursor()\n while time.time()-start < duration:\n col_name = random.randint(0, ncolumns-1)\n row_name = random.randint(1, nrows-1)\n lock_contention = f'update {table_name} set name{col_name}=(SELECT substr(md5(random()::text), 1, {colsize})) where id ={row_name}'\n #db.concurrent_execute_sql(threads,duration,lock_contention,nrows)\n cur.execute(lock_contention)\n conn.commit()\n conn.commit()\n conn.close()\n\nif __name__ == \"__main__\":\n # Number of threads to use for concurrent inserts\n num_threads = 6\n \n # Duration for which to run the inserts (in seconds)\n insert_duration = None\n \n # Number of columns in the table\n num_columns = 76\n \n # Number of rows to insert\n num_rows = 427732\n \n # Size of each column (in characters)\n column_size = 100\n \n # Table name\n table_name = 'table1'\n \n nindex=6\n \n # Call the insert_large_data function\n redundent_index(num_threads, insert_duration, num_columns, num_rows, column_size, nindex,table_name)\n", + "description": "In a database used by an online retailer, a redundant index is created for a large dataset containing 76 columns and 427,732 rows. This index is unnecessary and will result in additional storage usage and performance overhead.\n", + "desc": "In an IoT scenario, imagine a database named 'IoTDataDB' that is used for collecting and analyzing data from various IoT devices. This database is designed to handle a large volume of sensor data and contains a primary table called 'SensorReadings' to store this data. 
Each row in the table represents a reading from a sensor and includes information such as sensor ID, reading type, value, timestamp, location, and status.Now, suppose multiple IoT devices start transmitting data simultaneously at a high frequency. This can create a situation where there is a large influx of data being written to the 'SensorReadings' table. Due to the high concurrency of data writes, the database might face I/O contention issues. This means that the storage and network bandwidth of the system are strained, causing slower data transfer speeds and impacting database performance.Additionally, frequent write operations in the database can lead to locking and transaction management issues during peak periods. This can further slow down data processing and impact the recording of sensor readings. As a result, anomalies related to I/O contention can occur in the database and affect the overall efficiency of the IoT system.\n" + }, + "69": { + "start_time": "1697306974", + "end_time": "1697307065", + "start_timestamp": "2023-10-15 02:09:34", + "end_timestamp": "2023-10-15 02:11:05", + "alerts": [], + "labels": [ + "INSERT_LARGE_DATA", + "IO_CONTENTION" + ], + "command": "python anomaly_trigger/main.py --anomaly INSERT_LARGE_DATA,IO_CONTENTION", + "script": "import os\nimport datetime\n\n#print the current time\ndef print_time():\n current_time = datetime.datetime.now()\n formatted_time = current_time.strftime(\"%Y-%m-%d %H:%M:%S\")\n print(formatted_time)\n\nif __name__ == \"__main__\":\n print_time()\n command = (\n \"su - root -c 'cd /sysbench-tpcc-master; \"\n \"./tpcc.lua --db-driver=pgsql --tables=2 --scale=3 --threads=50 --events=0 \"\n \"--pgsql-host=xxxx --pgsql-user=xxxx --pgsql-password=xxxx \"\n \"--pgsql-port=5432 --pgsql-db=tpcc --time=90 --rand-type=uniform --report-interval=10 run'\"\n )\n\n os.system(command)\n print_time()\n", + "description": "In a file sharing system, multiple users are uploading, downloading, or editing files at the same time. This creates competition for input/output operations, causing a slowdown in file transfers.\n", + "desc": "In a business intelligence scenario, there is a large database named 'BusinessAnalyticsDB' that stores data for analysis and reporting purposes. This database contains multiple tables, one of which is the 'SalesData' table that stores detailed information about sales transactions. This table contains millions of rows, each representing a sales transaction, with various columns such as transaction ID, customer ID, product ID, quantity, price, date, and more. During a peak period, when multiple users are running complex queries that involve joining the 'SalesData' table with other tables, the database might experience poor join performance. This poor performance could be due to the lack of proper indexing or optimization in the join conditions, resulting in slow query execution and delays in generating reports. Additionally, if the database server does not have sufficient CPU processing power to handle the high computational demands of these join queries, CPU contention can occur, further impacting the performance of the database. 
This can result in increased query response times, slower report generation, and overall reduced efficiency in the business analytics process.\n" + }, + "70": { + "start_time": "1697307125", + "end_time": "1697307185", + "start_timestamp": "2023-10-15 02:12:05", + "end_timestamp": "2023-10-15 02:13:05", + "alerts": [], + "labels": [ + "POOR JOIN PERFORMANCE", + "CPU CONTENTION" + ], + "command": "python anomaly_trigger/main.py --anomaly POOR_JOIN_PERFORMANCE,CPU_CONTENTION", + "script": "import os\nimport re\nimport time\n\nimport psycopg2\n\n\nREPEATCOUNT = 1\nTIMELOGPATH = str(int(time.time())) + \"_job_trigger_time_log.txt\"\nTIMELOG = open(TIMELOGPATH, 'w+')\n\n\nclass Database():\n\n def __init__(self):\n self.conn = None\n self.conn = psycopg2.connect(database='imdbload',\n user='xxxx',\n password='xxxx',\n host='xxxx',\n port=5432)\n\n def execute_sql(self, sql):\n fail = 1\n cur = self.conn.cursor()\n i = 0\n cnt = 3\n while fail == 1 and i < cnt:\n try:\n fail = 0\n cur.execute(sql)\n except BaseException as error:\n fail = 1\n print(error)\n res = []\n if fail == 0:\n res = cur.fetchall()\n i = i + 1\n if fail == 1:\n # print(\"SQL Execution Fatal!!\", sql)\n return 0, ''\n elif fail == 0:\n return 1, res\n\n\ndef all_sql_files():\n res_path = \"{}/join-order-benchmark-master/\".format(\n os.path.dirname(os.path.abspath(__file__)))\n # all_file_list = list(filter(file_filter, os.listdir(res_path)))\n # all_file_list = sorted(all_file_list, key=custom_sort)\n all_file_list = [\n '1a.sql', '1b.sql', '1c.sql', '1d.sql',\n '2a.sql', '2b.sql', '2c.sql', '2d.sql',\n '3a.sql', '3b.sql', '3c.sql',\n '4a.sql', '4b.sql', '4c.sql',\n '5a.sql', '5b.sql', '5c.sql',\n '6a.sql', '6b.sql', '6c.sql', '6d.sql', '6e.sql', '6f.sql',\n '7a.sql', '7b.sql', '7c.sql',\n '8a.sql', '8b.sql', '8c.sql', '8d.sql',\n '9a.sql', '9b.sql', '9c.sql', '9d.sql',\n '10a.sql', '10b.sql', '10c.sql',\n '11a.sql', '11b.sql', '11c.sql', '11d.sql',\n '12a.sql', '12b.sql', '12c.sql',\n '13a.sql', '13b.sql', '13c.sql', '13d.sql',\n '14a.sql', '14b.sql', '14c.sql',\n '15a.sql', '15b.sql', '15c.sql', '15d.sql',\n '16a.sql', '16b.sql', '16c.sql', '16d.sql',\n '17a.sql', '17b.sql', '17c.sql', '17d.sql', '17e.sql', '17f.sql',\n '18a.sql', '18b.sql', '18c.sql',\n '19a.sql', '19b.sql', '19c.sql', '19d.sql',\n '20a.sql', '20b.sql', '20c.sql',\n '21a.sql', '21b.sql', '21c.sql',\n '22a.sql', '22b.sql', '22c.sql', '22d.sql',\n '23a.sql', '23b.sql', '23c.sql',\n '24a.sql', '24b.sql',\n '25a.sql', '25b.sql', '25c.sql',\n '26a.sql', '26b.sql', '26c.sql',\n '27a.sql', '27b.sql', '27c.sql',\n '28a.sql', '28b.sql', '28c.sql',\n '29a.sql', '29b.sql', '29c.sql',\n '30a.sql', '30b.sql', '30c.sql',\n '31a.sql', '31b.sql', '31c.sql',\n '32a.sql', '32b.sql',\n '33a.sql', '33b.sql', '33c.sql']\n\n print(all_file_list)\n files_list = []\n for file in all_file_list:\n files_list.append(res_path + file)\n return files_list\n\n\ndef custom_sort(item):\n # \u63d0\u53d6\u6570\u5b57\u548c\u5b57\u6bcd\u90e8\u5206\n match = re.match(r'(\\d+)(\\D+)', item)\n # \u5c06\u6570\u5b57\u90e8\u5206\u8f6c\u6362\u4e3a\u6574\u6570\u4ee5\u8fdb\u884c\u6bd4\u8f83\n num_part = int(match.group(1))\n # \u8fd4\u56de\u5143\u7ec4\u4ee5\u6309\u6570\u5b57\u548c\u5b57\u6bcd\u6392\u5e8f\n return (num_part, match.group(2))\n\n\ndef file_filter(f):\n if f[-4:] == '.sql' and 'schema' not in f and 'fkindexes' not in f:\n return True\n else:\n return False\n\n\ndef get_sql_from_file(file_name):\n file = open(file_name)\n lines = file.readlines().copy()\n sql = ''\n for line in lines:\n 
sql += line\n sql = sql.replace('\n', ' ').replace(' ', ' ').replace(' ', ' ')\n file.close()\n return sql\n\n\ndef test_hint_from_file(sql_file):\n db = Database()\n sql = get_sql_from_file(sql_file)\n success, result_cont = db.execute_sql(sql)\n print(success, result_cont)\n\n\ndef test_all():\n sql_files = all_sql_files()\n\n for sql_file in list(sql_files)[:-10]:\n if sql_file:\n test_hint_from_file(sql_file)\n\n\ndef test_one():\n res_path = \"{}/join-order-benchmark-master/\".format(\n os.path.dirname(os.path.abspath(__file__)))\n test_hint_from_file(res_path + '1a.sql')\n\n\nif __name__ == '__main__':\n for i in range(0, REPEATCOUNT):\n TIMELOG.write(str(int(time.time()))+\";\")\n test_all()\n TIMELOG.write(str(int(time.time()))+\"\n\")\n TIMELOG.flush()\n\n TIMELOG.close()\n\n)\n", + "description": "In a data analytics system, multiple users are performing a join operation on a large dataset. However, the join performance is poor due to inefficient query optimization and lack of necessary indices. Additionally, the system is facing CPU contention as multiple users compete for computing resources, leading to delays in query execution.\n", + "desc": "In an Internet of Things (IoT) scenario, there is a database called 'SensorDataDB' that is used to collect and analyze sensor data. This database is designed to handle a large volume of data from various types of sensors. One of the key tables in this database is 'SensorReadings', which stores information about sensor readings including sensor ID, reading type, value, and timestamp. In this scenario, there is a need to perform complex queries that involve related subqueries, such as retrieving the average sensor reading value for a specific type of sensor. However, when dealing with a large number of sensor readings, executing individual subqueries for each reading can become time-consuming and inefficient. 
This is because the database needs to read a significant amount of data from the disk, which can result in I/O bottlenecks and slow down the query performance.\n" + }, + "71": { + "start_time": "1697307245", + "end_time": "1697307386", + "start_timestamp": "2023-10-15 02:14:05", + "end_timestamp": "2023-10-15 02:16:26", + "alerts": [], + "labels": [ + "FETCH_LARGE_DATA", + "CORRELATED SUBQUERY" + ], + "command": "python anomaly_trigger/main.py --anomaly FETCH_LARGE_DATA,CORRELATED_SUBQUERY", + "script": "import os\nimport re\nimport time\n\nimport psycopg2\n\n\nREPEATCOUNT = 1\nTIMELOGPATH = str(int(time.time())) + \"_tpch_trigger_time_log.txt\"\nTIMELOG = open(TIMELOGPATH, 'w+')\n\n\nclass Database():\n\n def __init__(self):\n self.conn = None\n self.conn = psycopg2.connect(database='tpch',\n user='xxxx',\n password='xxxx',\n host='xxxx',\n port=5432)\n\n def execute_sql(self, sql):\n fail = 1\n cur = self.conn.cursor()\n i = 0\n cnt = 3\n while fail == 1 and i < cnt:\n try:\n fail = 0\n cur.execute(sql)\n except BaseException as error:\n fail = 1\n print(error)\n res = []\n if fail == 0:\n res = cur.fetchall()\n i = i + 1\n if fail == 1:\n # print(\"SQL Execution Fatal!!\", sql)\n return 0, ''\n elif fail == 0:\n return 1, res\n\n\ndef all_sql_files():\n res_path = \"{}/tpch-queries/\".format(\n os.path.dirname(os.path.abspath(__file__)))\n # all_file_list = list(filter(file_filter, os.listdir(res_path)))\n # all_file_list = sorted(all_file_list, key=custom_sort)\n all_file_list = [\n '4.explain.sql']\n\n print(all_file_list)\n files_list = []\n for file in all_file_list:\n files_list.append(res_path + file)\n return files_list\n\n\ndef custom_sort(item):\n # \u63d0\u53d6\u6570\u5b57\u548c\u5b57\u6bcd\u90e8\u5206\n match = re.match(r'(\\d+)(\\D+)', item)\n # \u5c06\u6570\u5b57\u90e8\u5206\u8f6c\u6362\u4e3a\u6574\u6570\u4ee5\u8fdb\u884c\u6bd4\u8f83\n num_part = int(match.group(1))\n # \u8fd4\u56de\u5143\u7ec4\u4ee5\u6309\u6570\u5b57\u548c\u5b57\u6bcd\u6392\u5e8f\n return (num_part, match.group(2))\n\n\ndef file_filter(f):\n if f[-4:] == '.sql' and 'schema' not in f and 'fkindexes' not in f:\n return True\n else:\n return False\n\n\ndef get_sql_from_file(file_name):\n file = open(file_name)\n lines = file.readlines().copy()\n sql = ''\n for line in lines:\n sql += line\n sql = sql.replace('\n', ' ').replace(' ', ' ').replace(' ', ' ')\n file.close()\n return sql\n\n\ndef test_hint_from_file(sql_file):\n db = Database()\n sql = get_sql_from_file(sql_file)\n success, result_cont = db.execute_sql(sql)\n print(success, result_cont)\n\n\ndef test_all():\n sql_files = all_sql_files()\n\n for sql_file in list(sql_files):\n if sql_file:\n test_hint_from_file(sql_file)\n\n\ndef test_one():\n res_path = \"{}/tpch-queries/\".format(\n os.path.dirname(os.path.abspath(__file__)))\n test_hint_from_file(res_path + '1.explain.sql')\n\n\nif __name__ == '__main__':\n for i in range(0, REPEATCOUNT):\n TIMELOG.write(str(int(time.time()))+\";\")\n test_all()\n TIMELOG.write(str(int(time.time()))+\"\n\")\n TIMELOG.flush()\n\n TIMELOG.close()\n", + "description": "In an online shopping platform's database, retrieving a large amount of data using related subqueries can lead to a degradation in performance. This degradation occurs when querying inventory for each product, especially when there is a large number of products.\n", + "desc": "In the Internet of Things (IoT) scenario, there is a specialized database called 'IoTDataDB' that is used to collect and process data from various IoT devices. 
This database is designed to handle a large volume of data generated by these devices. One of the key tables in this database is called 'DeviceData', which stores the data collected from individual devices. Each row in the 'DeviceData' table represents a data record from a specific device, with a total of 9 columns. These columns might include device ID, sensor type, sensor value, timestamp, location, device status, battery level, firmware version, and connectivity information. The size of each column is limited to 23 characters. In a scenario where 55 devices are consistently transmitting data at a high frequency, for example, measuring temperature and humidity readings every second, the database might experience performance issues due to the large volume of concurrent write operations. If the database is not optimized to handle this large influx of data, it can lead to high write latency and possible database locking. This can result in anomalies in the database, affecting the real-time monitoring and analysis of IoT data and potentially impacting the functionality of the entire IoT system.\n" + }, + "72": { + "start_time": "1697307446", + "end_time": "1697307517", + "start_timestamp": "2023-10-15 02:17:26", + "end_timestamp": "2023-10-15 02:18:37", + "alerts": [], + "labels": [ + "highly concurrent commits or highly concurrent inserts" + ], + "command": "python anomaly_trigger/main.py --anomaly INSERT_LARGE_DATA", + "script": "import psycopg2\nimport sys\nsys.path.append('/root/DB-GPT/')\nimport time\nimport datetime\nimport random\nimport yaml\nfrom multiprocessing.pool import *\n\nclass DBArgs(object):\n\n def __init__(self, dbtype, config, dbname=None):\n self.dbtype = dbtype\n if self.dbtype == 'mysql':\n self.host = config['host']\n self.port = config['port']\n self.user = config['user']\n self.password = config['password']\n self.dbname = dbname if dbname else config['dbname']\n self.driver = 'com.mysql.jdbc.Driver'\n self.jdbc = 'jdbc:mysql://'\n else:\n self.host = config['host']\n self.port = config['port']\n self.user = config['user']\n self.password = config['password']\n self.dbname = dbname if dbname else config['dbname']\n self.driver = 'org.postgresql.Driver'\n self.jdbc = 'jdbc:postgresql://'\n\nclass Database():\n def __init__(self, args, timeout=-1):\n self.args = args\n self.conn = self.resetConn(timeout)\n\n\n # self.schema = self.compute_table_schema()\n\n def resetConn(self, timeout=-1):\n conn = psycopg2.connect(database=self.args.dbname,\n user=self.args.user,\n password=self.args.password,\n host=self.args.host,\n port=self.args.port)\n return conn\n\n def execute_sqls(self,sql):\n self.conn =self.resetConn(timeout=-1)\n cur = self.conn.cursor()\n cur.execute(sql)\n self.conn.commit()\n cur.close()\n self.conn.close()\n\n def execute_sql_duration(self, duration, sql, max_id=0, commit_interval=500):\n self.conn = self.resetConn(timeout=-1)\n cursor = self.conn.cursor()\n start = time.time()\n cnt = 0\n if duration > 0:\n while (time.time() - start) < duration:\n if max_id > 0:\n id = random.randint(1, max_id - 1)\n cursor.execute(sql + str(id) + ';')\n else:\n cursor.execute(sql)\n cnt += 1\n if cnt % commit_interval == 0:\n self.conn.commit()\n else:\n print(\"error, the duration should be larger than 0\")\n self.conn.commit()\n cursor.close()\n self.conn.close()\n return cnt\n\n def concurrent_execute_sql(self, threads, duration, sql, max_id=0, commit_interval=500):\n pool = ThreadPool(threads)\n results = [pool.apply_async(self.execute_sql_duration, (duration, 
sql, max_id, commit_interval)) for _ in range(threads)]\n pool.close()\n pool.join()\n return results\n \ndef init():\n #add the config\n config_path = \"/root/DB-GPT/config/tool_config.yaml\"\n with open(config_path, 'r') as config_file:\n config = yaml.safe_load(config_file) \n db_args =DBArgs('pgsql', config)\n return db_args\n\n#create a table\ndef create_table(table_name,colsize, ncolumns):\n db=Database(init())\n column_definitions = ', '.join(f'name{i} varchar({colsize})' for i in range(ncolumns))\n creat_sql = f'CREATE TABLE {table_name} (id int, {column_definitions}, time timestamp);'\n db.execute_sqls(creat_sql)\n\n#delete the table\ndef delete_table(table_name):\n db=Database(init())\n delete_sql=f'DROP TABLE if exists {table_name}'\n db.execute_sqls(delete_sql)\n\n#print the current time\ndef print_time():\n current_time = datetime.datetime.now()\n formatted_time = current_time.strftime(\"%Y-%m-%d %H:%M:%S\")\n print(formatted_time)\n\n\ndef insert_large_data(threads,duration,ncolumns,nrows,colsize,table_name='table1'):\n\n print_time()\n #Delete undeleted tables\n delete_table(table_name)\n #create a new table\n create_table(table_name,colsize, ncolumns)\n db=Database(init())\n #insert the data\n #insert_definitions = ', '.join(f'repeat(round(random()*999)::text,{(colsize//3)})' for i in range(ncolumns))\n insert_definitions = ', '.join(f'(SELECT substr(md5(random()::text), 1, {colsize}))' for i in range(ncolumns))\n insert_data=f'insert into {table_name} select generate_series(1,{nrows}),{insert_definitions}, now();'\n db.concurrent_execute_sql(threads,duration,insert_data,commit_interval=1)\n\n #delete the table\n delete_table(table_name)\n \n #print the end time\n print_time()\nif __name__ == \"__main__\":\n # Number of threads to use for concurrent inserts\n num_threads = 55\n \n # Duration for which to run the inserts (in seconds)\n insert_duration = None\n \n # Number of columns in the table\n num_columns = 9\n \n # Number of rows to insert\n num_rows = 70\n \n # Size of each column (in characters)\n column_size = 23\n \n # Table name\n table_name = 'table1'\n \n # Call the insert_large_data function\n insert_large_data(num_threads, insert_duration, num_columns, num_rows, column_size, table_name)\n", + "description": "In a smart home system, 55 devices are all trying to send a large amount of data to a central database simultaneously. Each device has 9 data fields, with each field being 23 characters long, and there are a total of 70 data entries. This process may cause the database to encounter an exception due to the high volume of data being inserted at once.\n", + "desc": "In the Internet of Things (IoT) scenario, imagine a database specifically designed to handle data from various types of sensors, called 'SensorDataDB'. This database is used to collect and analyze sensor data in smart home environments. Within this database, there is a crucial table named 'SensorReadings', which stores information about sensor readings. This table consists of 95 rows, each representing a unique sensor reading, and it contains 28 columns, each capable of storing information up to 54 characters long. These columns may include sensor ID, reading type, reading value, timestamp, sensor location, and status information.In this scenario, the database faces performance challenges when dealing with large amounts of data being inserted. 
When 55 sensors simultaneously transmit data at a high frequency, it puts a strain on the database's ability to handle the concurrent write requests efficiently. This strain is primarily due to insufficient buffering mechanisms, improper indexing, or the absence of effective data partitioning in the 'SensorReadings' table. These limitations in handling the concurrent write requests can result in increased write latency, affecting the overall performance of the database and potentially causing anomalies.\n" + }, + "73": { + "start_time": "1697307577", + "end_time": "1697307648", + "start_timestamp": "2023-10-15 02:19:37", + "end_timestamp": "2023-10-15 02:20:48", + "alerts": [], + "labels": [ + "highly concurrent commits or highly concurrent inserts" + ], + "command": "python anomaly_trigger/main.py --anomaly INSERT_LARGE_DATA", + "script": "import psycopg2\nimport sys\nsys.path.append('/root/DB-GPT/')\nimport time\nimport datetime\nimport random\nimport yaml\nfrom multiprocessing.pool import *\n\nclass DBArgs(object):\n\n def __init__(self, dbtype, config, dbname=None):\n self.dbtype = dbtype\n if self.dbtype == 'mysql':\n self.host = config['host']\n self.port = config['port']\n self.user = config['user']\n self.password = config['password']\n self.dbname = dbname if dbname else config['dbname']\n self.driver = 'com.mysql.jdbc.Driver'\n self.jdbc = 'jdbc:mysql://'\n else:\n self.host = config['host']\n self.port = config['port']\n self.user = config['user']\n self.password = config['password']\n self.dbname = dbname if dbname else config['dbname']\n self.driver = 'org.postgresql.Driver'\n self.jdbc = 'jdbc:postgresql://'\n\nclass Database():\n def __init__(self, args, timeout=-1):\n self.args = args\n self.conn = self.resetConn(timeout)\n\n\n # self.schema = self.compute_table_schema()\n\n def resetConn(self, timeout=-1):\n conn = psycopg2.connect(database=self.args.dbname,\n user=self.args.user,\n password=self.args.password,\n host=self.args.host,\n port=self.args.port)\n return conn\n\n def execute_sqls(self,sql):\n self.conn =self.resetConn(timeout=-1)\n cur = self.conn.cursor()\n cur.execute(sql)\n self.conn.commit()\n cur.close()\n self.conn.close()\n\n def execute_sql_duration(self, duration, sql, max_id=0, commit_interval=500):\n self.conn = self.resetConn(timeout=-1)\n cursor = self.conn.cursor()\n start = time.time()\n cnt = 0\n if duration > 0:\n while (time.time() - start) < duration:\n if max_id > 0:\n id = random.randint(1, max_id - 1)\n cursor.execute(sql + str(id) + ';')\n else:\n cursor.execute(sql)\n cnt += 1\n if cnt % commit_interval == 0:\n self.conn.commit()\n else:\n print(\"error, the duration should be larger than 0\")\n self.conn.commit()\n cursor.close()\n self.conn.close()\n return cnt\n\n def concurrent_execute_sql(self, threads, duration, sql, max_id=0, commit_interval=500):\n pool = ThreadPool(threads)\n results = [pool.apply_async(self.execute_sql_duration, (duration, sql, max_id, commit_interval)) for _ in range(threads)]\n pool.close()\n pool.join()\n return results\n \ndef init():\n #add the config\n config_path = \"/root/DB-GPT/config/tool_config.yaml\"\n with open(config_path, 'r') as config_file:\n config = yaml.safe_load(config_file) \n db_args =DBArgs('pgsql', config)\n return db_args\n\n#create a table\ndef create_table(table_name,colsize, ncolumns):\n db=Database(init())\n column_definitions = ', '.join(f'name{i} varchar({colsize})' for i in range(ncolumns))\n creat_sql = f'CREATE TABLE {table_name} (id int, {column_definitions}, time 
timestamp);'\n db.execute_sqls(creat_sql)\n\n#delete the table\ndef delete_table(table_name):\n db=Database(init())\n delete_sql=f'DROP TABLE if exists {table_name}'\n db.execute_sqls(delete_sql)\n\n#print the current time\ndef print_time():\n current_time = datetime.datetime.now()\n formatted_time = current_time.strftime(\"%Y-%m-%d %H:%M:%S\")\n print(formatted_time)\n\n\ndef insert_large_data(threads,duration,ncolumns,nrows,colsize,table_name='table1'):\n\n print_time()\n #Delete undeleted tables\n delete_table(table_name)\n #create a new table\n create_table(table_name,colsize, ncolumns)\n db=Database(init())\n #insert the data\n #insert_definitions = ', '.join(f'repeat(round(random()*999)::text,{(colsize//3)})' for i in range(ncolumns))\n insert_definitions = ', '.join(f'(SELECT substr(md5(random()::text), 1, {colsize}))' for i in range(ncolumns))\n insert_data=f'insert into {table_name} select generate_series(1,{nrows}),{insert_definitions}, now();'\n db.concurrent_execute_sql(threads,duration,insert_data,commit_interval=1)\n\n #delete the table\n delete_table(table_name)\n \n #print the end time\n print_time()\nif __name__ == \"__main__\":\n # Number of threads to use for concurrent inserts\n num_threads = 55\n \n # Duration for which to run the inserts (in seconds)\n insert_duration = None\n \n # Number of columns in the table\n num_columns = 28\n \n # Number of rows to insert\n num_rows = 95\n \n # Size of each column (in characters)\n column_size = 54\n \n # Table name\n table_name = 'table1'\n \n # Call the insert_large_data function\n insert_large_data(num_threads, insert_duration, num_columns, num_rows, column_size, table_name)\n", + "description": "In an IoT system, 55 sensors generate a large amount of data that needs to be inserted into a database simultaneously. This data consists of 28 columns, with each column storing 54 characters, and there are a total of 95 rows. This process simulates a database exception caused by the insertion of such large amounts of data.\n", + "desc": "In an Internet of Things (IoT) environment, there is a database designed specifically for collecting and analyzing sensor data. This database is called 'SensorDataDB' and it is used to store data from various types of sensors. One of the key tables in this database is 'SensorReadings', which contains information about different sensor readings. This table consists of 338 rows of data, with each row representing a reading from a sensor. The table has 61 columns, each containing information of up to 63 characters. These columns may include sensor ID, reading type, reading value, timestamp, sensor location, and status information. During a period of high sensor activity, where 167 sensors are transmitting data simultaneously, there might be a contention for locks in the database. This contention could occur if multiple users or systems are trying to update or access the same data in the 'SensorReadings' table at the same time. If this contention persists for a significant duration, it could result in performance issues, such as delays in updating data or accessing the database. 
It is important to address these lock contention anomalies to ensure smooth operations in the IoT environment.\n" + }, + "74": { + "start_time": "1697307708", + "end_time": "1697307768", + "start_timestamp": "2023-10-15 02:21:48", + "end_timestamp": "2023-10-15 02:22:48", + "alerts": [], + "labels": [ + "highly concurrent updates" + ], + "command": "python anomaly_trigger/main.py --anomaly LOCK_CONTENTION", + "script": "import psycopg2\nimport sys\nsys.path.append('/root/DB-GPT/')\nimport time\nimport datetime\nimport random\nimport yaml\nfrom multiprocessing.pool import *\n\n\nclass DBArgs(object):\n\n def __init__(self, dbtype, config, dbname=None):\n self.dbtype = dbtype\n if self.dbtype == 'mysql':\n self.host = config['host']\n self.port = config['port']\n self.user = config['user']\n self.password = config['password']\n self.dbname = dbname if dbname else config['dbname']\n self.driver = 'com.mysql.jdbc.Driver'\n self.jdbc = 'jdbc:mysql://'\n else:\n self.host = config['host']\n self.port = config['port']\n self.user = config['user']\n self.password = config['password']\n self.dbname = dbname if dbname else config['dbname']\n self.driver = 'org.postgresql.Driver'\n self.jdbc = 'jdbc:postgresql://'\n\nclass Database():\n def __init__(self, args, timeout=-1):\n self.args = args\n self.conn = self.resetConn(timeout)\n\n\n # self.schema = self.compute_table_schema()\n\n def resetConn(self, timeout=-1):\n conn = psycopg2.connect(database=self.args.dbname,\n user=self.args.user,\n password=self.args.password,\n host=self.args.host,\n port=self.args.port)\n return conn\n \n def execute_sqls(self,sql):\n self.conn =self.resetConn(timeout=-1)\n cur = self.conn.cursor()\n cur.execute(sql)\n self.conn.commit()\n cur.close()\n self.conn.close()\n\n def execute_sql_duration(self, duration, sql, max_id=0, commit_interval=500):\n self.conn = self.resetConn(timeout=-1)\n cursor = self.conn.cursor()\n start = time.time()\n cnt = 0\n if duration > 0:\n while (time.time() - start) < duration:\n if max_id > 0:\n id = random.randint(1, max_id - 1)\n cursor.execute(sql + str(id) + ';')\n else:\n cursor.execute(sql)\n cnt += 1\n if cnt % commit_interval == 0:\n self.conn.commit()\n else:\n print(\"error, the duration should be larger than 0\")\n self.conn.commit()\n cursor.close()\n self.conn.close()\n return cnt\n\n def concurrent_execute_sql(self, threads, duration, sql, max_id=0, commit_interval=500):\n pool = ThreadPool(threads)\n results = [pool.apply_async(self.execute_sql_duration, (duration, sql, max_id, commit_interval)) for _ in range(threads)]\n pool.close()\n pool.join()\n return results\n\ndef init():\n #add the config\n config_path = \"/root/DB-GPT/config/tool_config.yaml\"\n with open(config_path, 'r') as config_file:\n config = yaml.safe_load(config_file) \n db_args =DBArgs('pgsql', config)\n return db_args\n\n\n#create a table\ndef create_table(table_name,colsize, ncolumns):\n db=Database(init())\n column_definitions = ', '.join(f'name{i} varchar({colsize})' for i in range(ncolumns))\n creat_sql = f'CREATE TABLE {table_name} (id int, {column_definitions}, time timestamp);'\n db.execute_sqls(creat_sql)\n\n#delete the table\ndef delete_table(table_name):\n db=Database(init())\n delete_sql=f'DROP TABLE if exists {table_name}'\n db.execute_sqls(delete_sql)\n\n#print the current time\ndef print_time():\n current_time = datetime.datetime.now()\n formatted_time = current_time.strftime(\"%Y-%m-%d %H:%M:%S\")\n print(formatted_time)\n\n\ndef 
lock_contention(threads,duration,ncolumns,nrows,colsize,table_name='table1'):\n #create a new table\n print_time()\n delete_table(table_name)\n create_table(table_name,colsize, ncolumns)\n db=Database(init())\n # insert some data to be updated \n insert_definitions = ', '.join(f'(SELECT substr(md5(random()::text), 1, {colsize}))' for i in range(ncolumns))\n insert_data=f'insert into {table_name} select generate_series(1,{nrows}),{insert_definitions}, now();' \n db.execute_sqls(insert_data) \n pool = Pool(threads)\n for _ in range(threads):\n pool.apply_async(\n lock, (table_name, ncolumns, colsize, duration, nrows))\n pool.close()\n pool.join()\n #delete the table\n delete_table(table_name)\n print_time()\n\ndef lock(table_name, ncolumns, colsize, duration, nrows):\n args=init()\n start = time.time()\n #lock_contention\n while time.time()-start < duration:\n conn = psycopg2.connect(database=args.dbname, user=args.user, password=args.password,\n host=args.host, port=args.port)\n cur = conn.cursor()\n while time.time()-start < duration:\n col_name = random.randint(0, ncolumns-1)\n row_name = random.randint(1, nrows-1)\n lock_contention = f'update {table_name} set name{col_name}=(SELECT substr(md5(random()::text), 1, {colsize})) where id ={row_name}'\n #db.concurrent_execute_sql(threads,duration,lock_contention,nrows)\n cur.execute(lock_contention)\n conn.commit()\n conn.commit()\n conn.close()\n\nif __name__ == \"__main__\":\n # Number of threads to use for concurrent inserts\n num_threads = 167\n \n # Duration for which to run the inserts (in seconds)\n insert_duration = None\n \n # Number of columns in the table\n num_columns = 61\n \n # Number of rows to insert\n num_rows = 338\n \n # Size of each column (in characters)\n column_size = 63\n \n # Table name\n table_name = 'table1'\n \n # Call the insert_large_data function\n lock_contention(num_threads, insert_duration, num_columns, num_rows, column_size, table_name)\n", + "description": "In an online database system, there are 167 users simultaneously trying to perform frequent update operations on a database table containing 61 columns and 338 rows of records. Each column has a size of 63 characters. The users are competing to lock the database table while performing the updates. This simulation aims to trigger a database exception due to the contention for locking in the system.\n", + "desc": "In a common scenario of an Internet of Things (IoT) environment, there exists a database named 'IoTDataDB', which is designed to handle a massive amount of data generated by various IoT devices. This database stores sensor data collected by these devices, including data such as temperature, humidity, pressure, light, motion, and more. The primary table in this database is called 'SensorData', which contains 3,555,214 rows of data, each representing a specific data reading from a sensor. Each row in the table consists of 9 columns, with each column capable of storing up to 54 characters of data. These columns may include attributes such as sensor ID, data type, data value, timestamp, location, device ID, and more.In this particular scenario, a vacuum operation needs to be performed on the 'SensorData' table. This operation involves optimizing the table's storage and performance by reclaiming unused space and rearranging the data to improve query performance. Since the dataset is quite large, with millions of rows, and each row containing several columns of data, the vacuum operation could take a considerable amount of time to complete. 
During this time, the database might experience some performance degradation, such as slower response times for other database operations or temporary unavailability of specific services. Therefore, it is essential to plan and execute the vacuum operation during low traffic periods to minimize the impact on overall system performance and user experience in the IoT environment.\n" + }, + "75": { + "start_time": "1697307828", + "end_time": "1697307943", + "start_timestamp": "2023-10-15 02:23:48", + "end_timestamp": "2023-10-15 02:25:43", + "alerts": [], + "labels": [ + "highly deletes" + ], + "command": "python anomaly_trigger/main.py --anomaly VACUUM", + "script": "import psycopg2\nimport sys\nsys.path.append('/root/DB-GPT/')\nimport time\nimport datetime\nimport random\nimport yaml\nfrom multiprocessing.pool import *\n\n\nclass DBArgs(object):\n\n def __init__(self, dbtype, config, dbname=None):\n self.dbtype = dbtype\n if self.dbtype == 'mysql':\n self.host = config['host']\n self.port = config['port']\n self.user = config['user']\n self.password = config['password']\n self.dbname = dbname if dbname else config['dbname']\n self.driver = 'com.mysql.jdbc.Driver'\n self.jdbc = 'jdbc:mysql://'\n else:\n self.host = config['host']\n self.port = config['port']\n self.user = config['user']\n self.password = config['password']\n self.dbname = dbname if dbname else config['dbname']\n self.driver = 'org.postgresql.Driver'\n self.jdbc = 'jdbc:postgresql://'\n\nclass Database():\n def __init__(self, args, timeout=-1):\n self.args = args\n self.conn = self.resetConn(timeout)\n\n\n # self.schema = self.compute_table_schema()\n\n def resetConn(self, timeout=-1):\n conn = psycopg2.connect(database=self.args.dbname,\n user=self.args.user,\n password=self.args.password,\n host=self.args.host,\n port=self.args.port)\n return conn\n \n def execute_sqls(self,sql):\n self.conn =self.resetConn(timeout=-1)\n cur = self.conn.cursor()\n cur.execute(sql)\n self.conn.commit()\n cur.close()\n self.conn.close()\n\n def execute_sql_duration(self, duration, sql, max_id=0, commit_interval=500):\n self.conn = self.resetConn(timeout=-1)\n cursor = self.conn.cursor()\n start = time.time()\n cnt = 0\n if duration > 0:\n while (time.time() - start) < duration:\n if max_id > 0:\n id = random.randint(1, max_id - 1)\n cursor.execute(sql + str(id) + ';')\n else:\n cursor.execute(sql)\n cnt += 1\n if cnt % commit_interval == 0:\n self.conn.commit()\n else:\n print(\"error, the duration should be larger than 0\")\n self.conn.commit()\n cursor.close()\n self.conn.close()\n return cnt\n\n def concurrent_execute_sql(self, threads, duration, sql, max_id=0, commit_interval=500):\n pool = ThreadPool(threads)\n results = [pool.apply_async(self.execute_sql_duration, (duration, sql, max_id, commit_interval)) for _ in range(threads)]\n pool.close()\n pool.join()\n return results\n \ndef init():\n #add the config\n config_path = \"/root/DB-GPT/config/tool_config.yaml\"\n with open(config_path, 'r') as config_file:\n config = yaml.safe_load(config_file) \n db_args =DBArgs('pgsql', config)\n return db_args\n\n\n#create a table\ndef create_table(table_name,colsize, ncolumns):\n db=Database(init())\n column_definitions = ', '.join(f'name{i} varchar({colsize})' for i in range(ncolumns))\n creat_sql = f'CREATE TABLE {table_name} (id int, {column_definitions}, time timestamp);'\n db.execute_sqls(creat_sql)\n\n#delete the table\ndef delete_table(table_name):\n db=Database(init())\n delete_sql=f'DROP TABLE if exists {table_name}'\n 
db.execute_sqls(delete_sql)\n\n#print the current time\ndef print_time():\n current_time = datetime.datetime.now()\n formatted_time = current_time.strftime(\"%Y-%m-%d %H:%M:%S\")\n print(formatted_time)\n\n\ndef vacuum(threads,duration,ncolumns,nrows,colsize,table_name='table1'):\n db=Database(init())\n #create a new table\n print_time()\n delete_table(table_name)\n create_table(table_name,colsize, ncolumns)\n\n # insert some data to be deleted\n insert_definitions = ', '.join(f'(SELECT substr(md5(random()::text), 1, {colsize}))' for i in range(ncolumns))\n insert_data=f'insert into {table_name} select generate_series(1,{nrows}),{insert_definitions}, now();' \n db.execute_sqls(insert_data) \n\n # delete 80% of the rows\n delete_nrows=int(nrows*0.8)\n vacuum=f'delete from {table_name} where id < {delete_nrows};'\n db.execute_sqls(vacuum)\n\n # do the select , then the vacuum occurs\n select='select * from '+table_name+' where id='\n db.concurrent_execute_sql(threads,duration,select,nrows)\n\n #delete the table\n delete_table(table_name)\n print_time()\n\n\nif __name__ == \"__main__\":\n # Number of threads to use for concurrent inserts\n num_threads = 78\n \n # Duration for which to run the inserts (in seconds)\n insert_duration = None\n \n # Number of columns in the table\n num_columns = 9\n \n # Number of rows to insert\n num_rows = 3555214\n \n # Size of each column (in characters)\n column_size = 54\n \n # Table name\n table_name = 'table1'\n \n # Call the insert_large_data function\n vacuum(num_threads, insert_duration, num_columns, num_rows, column_size, table_name)\n", + "description": "In a database of an e-commerce platform, 78 users simultaneously perform a search operation after a large-scale data cleaning operation on a table containing 9 columns, 3,555,214 rows, where each column has a size of 54 characters. This simulates the database exception that can occur in this scenario.\n", + "desc": "In a business intelligence scenario, involving the financial data analysis of large corporations, a database named 'CorporateFinanceDB' is utilized. This database contains multiple tables, including a key table called 'FinancialRecords', which stores detailed financial transactions and statement information. In this specific statement, the database administrator triggers an anomaly related to redundant index creation. The script is executed with the anomaly parameter 'REDUNDANT_INDEX', along with other parameters such as the number of threads set to 9, the number of columns in the table set to 95, the size of each column set to 57 characters, and the total number of rows in the table set to 606,760. This scenario simulates frequent creation and deletion of indexes for various complex financial queries. 
However, the repetitive index operations can consume additional storage space and impose performance overhead on the database, ultimately impacting the efficiency of the business intelligence analysis process.\n" + }, + "76": { + "start_time": "1697308003", + "end_time": "1697308119", + "start_timestamp": "2023-10-15 02:26:43", + "end_timestamp": "2023-10-15 02:28:39", + "alerts": [], + "labels": [ + "too many indexes" + ], + "command": "python anomaly_trigger/main.py --anomaly REDUNDANT_INDEX", + "script": "import psycopg2\nimport sys\nsys.path.append('/root/DB-GPT/')\nimport time\nimport datetime\nimport random\nimport yaml\nfrom multiprocessing.pool import *\n\n\nclass DBArgs(object):\n\n def __init__(self, dbtype, config, dbname=None):\n self.dbtype = dbtype\n if self.dbtype == 'mysql':\n self.host = config['host']\n self.port = config['port']\n self.user = config['user']\n self.password = config['password']\n self.dbname = dbname if dbname else config['dbname']\n self.driver = 'com.mysql.jdbc.Driver'\n self.jdbc = 'jdbc:mysql://'\n else:\n self.host = config['host']\n self.port = config['port']\n self.user = config['user']\n self.password = config['password']\n self.dbname = dbname if dbname else config['dbname']\n self.driver = 'org.postgresql.Driver'\n self.jdbc = 'jdbc:postgresql://'\n\nclass Database():\n def __init__(self, args, timeout=-1):\n self.args = args\n self.conn = self.resetConn(timeout)\n\n\n # self.schema = self.compute_table_schema()\n\n def resetConn(self, timeout=-1):\n conn = psycopg2.connect(database=self.args.dbname,\n user=self.args.user,\n password=self.args.password,\n host=self.args.host,\n port=self.args.port)\n return conn\n \n def execute_sqls(self,sql):\n self.conn =self.resetConn(timeout=-1)\n cur = self.conn.cursor()\n cur.execute(sql)\n self.conn.commit()\n cur.close()\n self.conn.close()\n\n def execute_sql_duration(self, duration, sql, max_id=0, commit_interval=500):\n self.conn = self.resetConn(timeout=-1)\n cursor = self.conn.cursor()\n start = time.time()\n cnt = 0\n if duration > 0:\n while (time.time() - start) < duration:\n if max_id > 0:\n id = random.randint(1, max_id - 1)\n cursor.execute(sql + str(id) + ';')\n else:\n cursor.execute(sql)\n cnt += 1\n if cnt % commit_interval == 0:\n self.conn.commit()\n else:\n print(\"error, the duration should be larger than 0\")\n self.conn.commit()\n cursor.close()\n self.conn.close()\n return cnt\n\n def concurrent_execute_sql(self, threads, duration, sql, max_id=0, commit_interval=500):\n pool = ThreadPool(threads)\n results = [pool.apply_async(self.execute_sql_duration, (duration, sql, max_id, commit_interval)) for _ in range(threads)]\n pool.close()\n pool.join()\n return results\n \n def build_index(self, table_name, idx_num):\n self.conn = self.resetConn(timeout=-1)\n cursor = self.conn.cursor()\n \n for i in range(0, idx_num):\n the_sql = 'CREATE INDEX index_' + table_name + '_' + str(i) + ' ON ' + table_name + '(name' + str(i) + ');'\n print(the_sql)\n cursor.execute(the_sql)\n\n \n self.conn.commit()\n self.conn.close()\n return\n\n\n \n def drop_index(self,table_name):\n self.conn = self.resetConn(timeout=-1)\n cursor = self.conn.cursor()\n cursor.execute(\"select indexname from pg_indexes where tablename='\"+table_name+\"';\")\n idxs = cursor.fetchall()\n for idx in idxs:\n the_sql = 'DROP INDEX ' + idx[0] + ';'\n cursor.execute(the_sql)\n print(the_sql)\n self.conn.commit()\n self.conn.close()\n return\n\n\ndef init():\n #add the config\n config_path = 
\"/root/DB-GPT/config/tool_config.yaml\"\n with open(config_path, 'r') as config_file:\n config = yaml.safe_load(config_file) \n db_args =DBArgs('pgsql', config)\n return db_args\n\n\n#create a table\ndef create_table(table_name,colsize, ncolumns):\n db=Database(init())\n column_definitions = ', '.join(f'name{i} varchar({colsize})' for i in range(ncolumns))\n creat_sql = f'CREATE TABLE {table_name} (id int, {column_definitions}, time timestamp);'\n db.execute_sqls(creat_sql)\n\n#delete the table\ndef delete_table(table_name):\n db=Database(init())\n delete_sql=f'DROP TABLE if exists {table_name}'\n db.execute_sqls(delete_sql)\n\n#print the current time\ndef print_time():\n current_time = datetime.datetime.now()\n formatted_time = current_time.strftime(\"%Y-%m-%d %H:%M:%S\")\n print(formatted_time)\n\ndef redundent_index(threads,duration,ncolumns,nrows,colsize,nindex,table_name='table1'):\n #create a new table\n print_time()\n delete_table(table_name)\n create_table(table_name,colsize, ncolumns)\n db=Database(init())\n # insert some data to be updated \n insert_definitions = ', '.join(f'(SELECT substr(md5(random()::text), 1, {colsize}))' for i in range(ncolumns))\n insert_data=f'insert into {table_name} select generate_series(1,{nrows}),{insert_definitions}, now();' \n db.execute_sqls(insert_data) \n\n #initialization of the indexes\n nindex=int((nindex*ncolumns)/10)\n db.build_index(table_name,nindex)\n id_index='CREATE INDEX index_'+table_name+'_id ON '+table_name+'(id);'\n db.execute_sqls(id_index)\n\n #lock_contention\n pool = Pool(threads)\n for _ in range(threads):\n pool.apply_async(\n lock, (table_name, ncolumns, colsize, duration, nrows))\n pool.close()\n pool.join()\n\n #drop the index\n db.drop_index(table_name)\n\n #delete the table\n delete_table(table_name)\n print_time()\n\n\ndef lock(table_name, ncolumns, colsize, duration, nrows):\n args=init()\n start = time.time()\n #lock_contention\n while time.time()-start < duration:\n conn = psycopg2.connect(database=args.dbname, user=args.user, password=args.password,\n host=args.host, port=args.port)\n cur = conn.cursor()\n while time.time()-start < duration:\n col_name = random.randint(0, ncolumns-1)\n row_name = random.randint(1, nrows-1)\n lock_contention = f'update {table_name} set name{col_name}=(SELECT substr(md5(random()::text), 1, {colsize})) where id ={row_name}'\n #db.concurrent_execute_sql(threads,duration,lock_contention,nrows)\n cur.execute(lock_contention)\n conn.commit()\n conn.commit()\n conn.close()\n\nif __name__ == \"__main__\":\n # Number of threads to use for concurrent inserts\n num_threads = 9\n \n # Duration for which to run the inserts (in seconds)\n insert_duration = None\n \n # Number of columns in the table\n num_columns = 95\n \n # Number of rows to insert\n num_rows = 606760\n \n # Size of each column (in characters)\n column_size = 57\n \n # Table name\n table_name = 'table1'\n \n nindex=6\n \n # Call the insert_large_data function\n redundent_index(num_threads, insert_duration, num_columns, num_rows, column_size, nindex,table_name)\n", + "description": "In a financial institution's database, 9 users perform a query on a table with 95 columns and 606,760 rows. Each column has a size of 57 characters. The query involves redundant indexes on attributes such as customer ID, transaction amount, and transaction date. This can result in additional storage consumption and performance overhead.\n", + "desc": "In an Internet of Things scenario, there is a database used for collecting and analyzing sensor data. 
This database is designed to handle a large volume of data from various types of sensors. The primary table in the database contains information from 100 sensors, including sensor ID, reading type, reading value, timestamp, sensor location, and status. During high-frequency data transmission from these sensors, the database might experience performance issues. This is due to factors such as inadequate data partitioning, insufficient buffering mechanisms, or improper indexing, which limit the database's ability to process concurrent write requests. As a result, write latency increases, and in some cases, database locking can occur, causing anomalies. These anomalies can affect the overall performance of the system and result in delayed or failed data processing.\n" + }, + "77": { + "start_time": "1697308179", + "end_time": "1697308270", + "start_timestamp": "2023-10-15 02:29:39", + "end_timestamp": "2023-10-15 02:31:10", + "alerts": [], + "labels": [ + "INSERT_LARGE_DATA", + "IO_CONTENTION" + ], + "command": "python anomaly_trigger/main.py --anomaly INSERT_LARGE_DATA,IO_CONTENTION", + "script": "import os\nimport datetime\n\n#print the current time\ndef print_time():\n current_time = datetime.datetime.now()\n formatted_time = current_time.strftime(\"%Y-%m-%d %H:%M:%S\")\n print(formatted_time)\n\nif __name__ == \"__main__\":\n print_time()\n command = (\n \"su - root -c 'cd /sysbench-tpcc-master; \"\n \"./tpcc.lua --db-driver=pgsql --tables=2 --scale=3 --threads=50 --events=0 \"\n \"--pgsql-host=xxxx --pgsql-user=xxxx --pgsql-password=xxxx \"\n \"--pgsql-port=5432 --pgsql-db=tpcc --time=90 --rand-type=uniform --report-interval=10 run'\"\n )\n\n os.system(command)\n print_time()\n", + "description": "In a file sharing system where multiple users are uploading, downloading, or editing files simultaneously, the system experiences contention for input/output resources. This results in slower file transfer speeds.\n", + "desc": "In the Internet of Things (IoT) context, a database called 'IoTDataDB' is used to store data collected from various sensors. This database contains a key table called 'SensorReadings', which stores detailed information about the sensor readings. The table consists of multiple rows of data, with each row representing a reading from a sensor. These readings include information such as sensor ID, reading type, reading value, timestamp, sensor location, and status. At a certain point, there is a high demand for performing complex join operations on this 'SensorReadings' table, involving large datasets and multiple tables. These join operations require data from the 'SensorReadings' table to be combined with data from other tables, based on common key fields. However, due to poor indexing or inefficient query optimization strategies, the database's performance in handling these join operations is suboptimal. This can result in slow query execution, increased CPU usage, and contention among multiple query threads. 
Consequently, the overall system performance might be affected, leading to delays in processing sensor data and potential inaccuracies in data analysis or decision-making.\n" + }, + "78": { + "start_time": "1697308330", + "end_time": "1697308390", + "start_timestamp": "2023-10-15 02:32:10", + "end_timestamp": "2023-10-15 02:33:10", + "alerts": [], + "labels": [ + "POOR JOIN PERFORMANCE", + "CPU CONTENTION" + ], + "command": "python anomaly_trigger/main.py --anomaly POOR_JOIN_PERFORMANCE,CPU_CONTENTION", + "script": "import os\nimport re\nimport time\n\nimport psycopg2\n\n\nREPEATCOUNT = 1\nTIMELOGPATH = str(int(time.time())) + \"_job_trigger_time_log.txt\"\nTIMELOG = open(TIMELOGPATH, 'w+')\n\n\nclass Database():\n\n def __init__(self):\n self.conn = None\n self.conn = psycopg2.connect(database='imdbload',\n user='xxxx',\n password='xxxx',\n host='xxxx',\n port=5432)\n\n def execute_sql(self, sql):\n fail = 1\n cur = self.conn.cursor()\n i = 0\n cnt = 3\n while fail == 1 and i < cnt:\n try:\n fail = 0\n cur.execute(sql)\n except BaseException as error:\n fail = 1\n print(error)\n res = []\n if fail == 0:\n res = cur.fetchall()\n i = i + 1\n if fail == 1:\n # print(\"SQL Execution Fatal!!\", sql)\n return 0, ''\n elif fail == 0:\n return 1, res\n\n\ndef all_sql_files():\n res_path = \"{}/join-order-benchmark-master/\".format(\n os.path.dirname(os.path.abspath(__file__)))\n # all_file_list = list(filter(file_filter, os.listdir(res_path)))\n # all_file_list = sorted(all_file_list, key=custom_sort)\n all_file_list = [\n '1a.sql', '1b.sql', '1c.sql', '1d.sql',\n '2a.sql', '2b.sql', '2c.sql', '2d.sql',\n '3a.sql', '3b.sql', '3c.sql',\n '4a.sql', '4b.sql', '4c.sql',\n '5a.sql', '5b.sql', '5c.sql',\n '6a.sql', '6b.sql', '6c.sql', '6d.sql', '6e.sql', '6f.sql',\n '7a.sql', '7b.sql', '7c.sql',\n '8a.sql', '8b.sql', '8c.sql', '8d.sql',\n '9a.sql', '9b.sql', '9c.sql', '9d.sql',\n '10a.sql', '10b.sql', '10c.sql',\n '11a.sql', '11b.sql', '11c.sql', '11d.sql',\n '12a.sql', '12b.sql', '12c.sql',\n '13a.sql', '13b.sql', '13c.sql', '13d.sql',\n '14a.sql', '14b.sql', '14c.sql',\n '15a.sql', '15b.sql', '15c.sql', '15d.sql',\n '16a.sql', '16b.sql', '16c.sql', '16d.sql',\n '17a.sql', '17b.sql', '17c.sql', '17d.sql', '17e.sql', '17f.sql',\n '18a.sql', '18b.sql', '18c.sql',\n '19a.sql', '19b.sql', '19c.sql', '19d.sql',\n '20a.sql', '20b.sql', '20c.sql',\n '21a.sql', '21b.sql', '21c.sql',\n '22a.sql', '22b.sql', '22c.sql', '22d.sql',\n '23a.sql', '23b.sql', '23c.sql',\n '24a.sql', '24b.sql',\n '25a.sql', '25b.sql', '25c.sql',\n '26a.sql', '26b.sql', '26c.sql',\n '27a.sql', '27b.sql', '27c.sql',\n '28a.sql', '28b.sql', '28c.sql',\n '29a.sql', '29b.sql', '29c.sql',\n '30a.sql', '30b.sql', '30c.sql',\n '31a.sql', '31b.sql', '31c.sql',\n '32a.sql', '32b.sql',\n '33a.sql', '33b.sql', '33c.sql']\n\n print(all_file_list)\n files_list = []\n for file in all_file_list:\n files_list.append(res_path + file)\n return files_list\n\n\ndef custom_sort(item):\n # \u63d0\u53d6\u6570\u5b57\u548c\u5b57\u6bcd\u90e8\u5206\n match = re.match(r'(\\d+)(\\D+)', item)\n # \u5c06\u6570\u5b57\u90e8\u5206\u8f6c\u6362\u4e3a\u6574\u6570\u4ee5\u8fdb\u884c\u6bd4\u8f83\n num_part = int(match.group(1))\n # \u8fd4\u56de\u5143\u7ec4\u4ee5\u6309\u6570\u5b57\u548c\u5b57\u6bcd\u6392\u5e8f\n return (num_part, match.group(2))\n\n\ndef file_filter(f):\n if f[-4:] == '.sql' and 'schema' not in f and 'fkindexes' not in f:\n return True\n else:\n return False\n\n\ndef get_sql_from_file(file_name):\n file = open(file_name)\n lines = file.readlines().copy()\n 
sql = ''\n for line in lines:\n sql += line\n sql = sql.replace('\n', ' ').replace(' ', ' ').replace(' ', ' ')\n file.close()\n return sql\n\n\ndef test_hint_from_file(sql_file):\n db = Database()\n sql = get_sql_from_file(sql_file)\n success, result_cont = db.execute_sql(sql)\n print(success, result_cont)\n\n\ndef test_all():\n sql_files = all_sql_files()\n\n for sql_file in list(sql_files)[:-10]:\n if sql_file:\n test_hint_from_file(sql_file)\n\n\ndef test_one():\n res_path = \"{}/join-order-benchmark-master/\".format(\n os.path.dirname(os.path.abspath(__file__)))\n test_hint_from_file(res_path + '1a.sql')\n\n\nif __name__ == '__main__':\n for i in range(0, REPEATCOUNT):\n TIMELOG.write(str(int(time.time()))+\";\")\n test_all()\n TIMELOG.write(str(int(time.time()))+\"\n\")\n TIMELOG.flush()\n\n TIMELOG.close()\n\n)\n", + "description": "In a business intelligence system, multiple complex join operations between large tables are performed. This puts a heavy load on the CPU, causing contention issues. The performance of join operations deteriorates, leading to slow query execution.\n", + "desc": "In an Internet of Things (IoT) scenario, imagine a database called 'SensorDataDB' used for collecting and analyzing sensor data. This database stores information from various types of sensors and contains a table called 'SensorReadings' where the data is recorded. Each row in this table represents a sensor reading and includes fields such as sensor ID, reading type, reading value, timestamp, sensor location, and status information. When conducting queries that involve fetching a large amount of sensor data, particularly related to correlated subqueries, the database might encounter performance issues. For example, if a query requires retrieving and analyzing data from multiple sensors and their readings, the database might need to perform complex subqueries that involve joining and aggregating data from different tables. These subqueries can be time-consuming and result in slower query execution times. 
Additionally, due to the large volume of data being processed, there might be I/O bottlenecks during the retrieval process, causing further delays in fetching the required data.\n" + }, + "79": { + "start_time": "1697308450", + "end_time": "1697308591", + "start_timestamp": "2023-10-15 02:34:10", + "end_timestamp": "2023-10-15 02:36:31", + "alerts": [], + "labels": [ + "FETCH_LARGE_DATA", + "CORRELATED SUBQUERY" + ], + "command": "python anomaly_trigger/main.py --anomaly FETCH_LARGE_DATA,CORRELATED_SUBQUERY", + "script": "import os\nimport re\nimport time\n\nimport psycopg2\n\n\nREPEATCOUNT = 1\nTIMELOGPATH = str(int(time.time())) + \"_tpch_trigger_time_log.txt\"\nTIMELOG = open(TIMELOGPATH, 'w+')\n\n\nclass Database():\n\n def __init__(self):\n self.conn = None\n self.conn = psycopg2.connect(database='tpch',\n user='xxxx',\n password='xxxx',\n host='xxxx',\n port=5432)\n\n def execute_sql(self, sql):\n fail = 1\n cur = self.conn.cursor()\n i = 0\n cnt = 3\n while fail == 1 and i < cnt:\n try:\n fail = 0\n cur.execute(sql)\n except BaseException as error:\n fail = 1\n print(error)\n res = []\n if fail == 0:\n res = cur.fetchall()\n i = i + 1\n if fail == 1:\n # print(\"SQL Execution Fatal!!\", sql)\n return 0, ''\n elif fail == 0:\n return 1, res\n\n\ndef all_sql_files():\n res_path = \"{}/tpch-queries/\".format(\n os.path.dirname(os.path.abspath(__file__)))\n # all_file_list = list(filter(file_filter, os.listdir(res_path)))\n # all_file_list = sorted(all_file_list, key=custom_sort)\n all_file_list = [\n '4.explain.sql']\n\n print(all_file_list)\n files_list = []\n for file in all_file_list:\n files_list.append(res_path + file)\n return files_list\n\n\ndef custom_sort(item):\n # \u63d0\u53d6\u6570\u5b57\u548c\u5b57\u6bcd\u90e8\u5206\n match = re.match(r'(\\d+)(\\D+)', item)\n # \u5c06\u6570\u5b57\u90e8\u5206\u8f6c\u6362\u4e3a\u6574\u6570\u4ee5\u8fdb\u884c\u6bd4\u8f83\n num_part = int(match.group(1))\n # \u8fd4\u56de\u5143\u7ec4\u4ee5\u6309\u6570\u5b57\u548c\u5b57\u6bcd\u6392\u5e8f\n return (num_part, match.group(2))\n\n\ndef file_filter(f):\n if f[-4:] == '.sql' and 'schema' not in f and 'fkindexes' not in f:\n return True\n else:\n return False\n\n\ndef get_sql_from_file(file_name):\n file = open(file_name)\n lines = file.readlines().copy()\n sql = ''\n for line in lines:\n sql += line\n sql = sql.replace('\n', ' ').replace(' ', ' ').replace(' ', ' ')\n file.close()\n return sql\n\n\ndef test_hint_from_file(sql_file):\n db = Database()\n sql = get_sql_from_file(sql_file)\n success, result_cont = db.execute_sql(sql)\n print(success, result_cont)\n\n\ndef test_all():\n sql_files = all_sql_files()\n\n for sql_file in list(sql_files):\n if sql_file:\n test_hint_from_file(sql_file)\n\n\ndef test_one():\n res_path = \"{}/tpch-queries/\".format(\n os.path.dirname(os.path.abspath(__file__)))\n test_hint_from_file(res_path + '1.explain.sql')\n\n\nif __name__ == '__main__':\n for i in range(0, REPEATCOUNT):\n TIMELOG.write(str(int(time.time()))+\";\")\n test_all()\n TIMELOG.write(str(int(time.time()))+\"\n\")\n TIMELOG.flush()\n\n TIMELOG.close()\n", + "description": "In a scenario where an e-commerce platform database is being used, there is a need to fetch a large amount of data related to inventory for each product. 
This requires executing correlated subqueries, which can be time-consuming and impact the performance of querying inventory, especially when dealing with a large number of products.\n", + "desc": "In an Internet of Things (IoT) scenario, there is a database called 'SensorDataDB' that is used to collect and analyze sensor data. This database is designed to handle a large volume of data from different types of sensors. One of the key tables in this database is 'SensorReadings', which stores data from 88 sensors. Each row in this table represents a reading from a sensor and contains 16 columns, with each column capable of storing up to 59 characters of information. These columns may include sensor ID, reading type, reading value, timestamp, sensor location, and status information. In this specific case, there are 188 users simultaneously sending data from their sensors to the database at a high frequency. Due to the lack of effective data partitioning, buffering mechanisms, or proper indexing, the database might experience performance issues while handling these concurrent write requests. This could lead to increased write latency and potentially cause anomalies in the data.\n" + }, + "80": { + "start_time": "1697308651", + "end_time": "1697308723", + "start_timestamp": "2023-10-15 02:37:31", + "end_timestamp": "2023-10-15 02:38:43", + "alerts": [], + "labels": [ + "highly concurrent commits or highly concurrent inserts" + ], + "command": "python anomaly_trigger/main.py --anomaly INSERT_LARGE_DATA", + "script": "import psycopg2\nimport sys\nsys.path.append('/root/DB-GPT/')\nimport time\nimport datetime\nimport random\nimport yaml\nfrom multiprocessing.pool import *\n\nclass DBArgs(object):\n\n def __init__(self, dbtype, config, dbname=None):\n self.dbtype = dbtype\n if self.dbtype == 'mysql':\n self.host = config['host']\n self.port = config['port']\n self.user = config['user']\n self.password = config['password']\n self.dbname = dbname if dbname else config['dbname']\n self.driver = 'com.mysql.jdbc.Driver'\n self.jdbc = 'jdbc:mysql://'\n else:\n self.host = config['host']\n self.port = config['port']\n self.user = config['user']\n self.password = config['password']\n self.dbname = dbname if dbname else config['dbname']\n self.driver = 'org.postgresql.Driver'\n self.jdbc = 'jdbc:postgresql://'\n\nclass Database():\n def __init__(self, args, timeout=-1):\n self.args = args\n self.conn = self.resetConn(timeout)\n\n\n # self.schema = self.compute_table_schema()\n\n def resetConn(self, timeout=-1):\n conn = psycopg2.connect(database=self.args.dbname,\n user=self.args.user,\n password=self.args.password,\n host=self.args.host,\n port=self.args.port)\n return conn\n\n def execute_sqls(self,sql):\n self.conn =self.resetConn(timeout=-1)\n cur = self.conn.cursor()\n cur.execute(sql)\n self.conn.commit()\n cur.close()\n self.conn.close()\n\n def execute_sql_duration(self, duration, sql, max_id=0, commit_interval=500):\n self.conn = self.resetConn(timeout=-1)\n cursor = self.conn.cursor()\n start = time.time()\n cnt = 0\n if duration > 0:\n while (time.time() - start) < duration:\n if max_id > 0:\n id = random.randint(1, max_id - 1)\n cursor.execute(sql + str(id) + ';')\n else:\n cursor.execute(sql)\n cnt += 1\n if cnt % commit_interval == 0:\n self.conn.commit()\n else:\n print(\"error, the duration should be larger than 0\")\n self.conn.commit()\n cursor.close()\n self.conn.close()\n return cnt\n\n def concurrent_execute_sql(self, threads, duration, sql, max_id=0, commit_interval=500):\n pool = 
ThreadPool(threads)\n results = [pool.apply_async(self.execute_sql_duration, (duration, sql, max_id, commit_interval)) for _ in range(threads)]\n pool.close()\n pool.join()\n return results\n \ndef init():\n #add the config\n config_path = \"/root/DB-GPT/config/tool_config.yaml\"\n with open(config_path, 'r') as config_file:\n config = yaml.safe_load(config_file) \n db_args =DBArgs('pgsql', config)\n return db_args\n\n#create a table\ndef create_table(table_name,colsize, ncolumns):\n db=Database(init())\n column_definitions = ', '.join(f'name{i} varchar({colsize})' for i in range(ncolumns))\n creat_sql = f'CREATE TABLE {table_name} (id int, {column_definitions}, time timestamp);'\n db.execute_sqls(creat_sql)\n\n#delete the table\ndef delete_table(table_name):\n db=Database(init())\n delete_sql=f'DROP TABLE if exists {table_name}'\n db.execute_sqls(delete_sql)\n\n#print the current time\ndef print_time():\n current_time = datetime.datetime.now()\n formatted_time = current_time.strftime(\"%Y-%m-%d %H:%M:%S\")\n print(formatted_time)\n\n\ndef insert_large_data(threads,duration,ncolumns,nrows,colsize,table_name='table1'):\n\n print_time()\n #Delete undeleted tables\n delete_table(table_name)\n #create a new table\n create_table(table_name,colsize, ncolumns)\n db=Database(init())\n #insert the data\n #insert_definitions = ', '.join(f'repeat(round(random()*999)::text,{(colsize//3)})' for i in range(ncolumns))\n insert_definitions = ', '.join(f'(SELECT substr(md5(random()::text), 1, {colsize}))' for i in range(ncolumns))\n insert_data=f'insert into {table_name} select generate_series(1,{nrows}),{insert_definitions}, now();'\n db.concurrent_execute_sql(threads,duration,insert_data,commit_interval=1)\n\n #delete the table\n delete_table(table_name)\n \n #print the end time\n print_time()\nif __name__ == \"__main__\":\n # Number of threads to use for concurrent inserts\n num_threads = 188\n \n # Duration for which to run the inserts (in seconds)\n insert_duration = None\n \n # Number of columns in the table\n num_columns = 16\n \n # Number of rows to insert\n num_rows = 88\n \n # Size of each column (in characters)\n column_size = 59\n \n # Table name\n table_name = 'table1'\n \n # Call the insert_large_data function\n insert_large_data(num_threads, insert_duration, num_columns, num_rows, column_size, table_name)\n", + "description": "In a data processing system, 188 sensors simultaneously generate large amounts of data that need to be inserted into a database. This process could cause a database exception if not properly handled. The data consists of 16 columns, each with a size of 59 characters, and there are 88 rows in total.\n", + "desc": "In an Internet of Things (IoT) scenario, there is a database designed to handle large volumes of data from various sensors, titled 'SensorDataDB'. It stores sensor readings, including values such as temperature, humidity, pressure, light, and motion. The database contains a table called 'SensorReadings' with 100 columns and 100 rows of data. Each column can store up to 72 characters of information. In this particular case, 188 threads are attempting to simultaneously insert 100 new rows of sensor readings into the database. However, due to the high number of concurrent insertions and the lack of proper optimization techniques such as buffering or data partitioning, the database may experience performance issues. This can result in increased write latency, delays in the insertion process, or even locking of the database. 
Such anomalies can negatively impact the overall efficiency and functionality of the IoT system.\n" + }, + "81": { + "start_time": "1697308783", + "end_time": "1697308855", + "start_timestamp": "2023-10-15 02:39:43", + "end_timestamp": "2023-10-15 02:40:55", + "alerts": [], + "labels": [ + "highly concurrent commits or highly concurrent inserts" + ], + "command": "python anomaly_trigger/main.py --anomaly INSERT_LARGE_DATA", + "script": "import psycopg2\nimport sys\nsys.path.append('/root/DB-GPT/')\nimport time\nimport datetime\nimport random\nimport yaml\nfrom multiprocessing.pool import *\n\nclass DBArgs(object):\n\n def __init__(self, dbtype, config, dbname=None):\n self.dbtype = dbtype\n if self.dbtype == 'mysql':\n self.host = config['host']\n self.port = config['port']\n self.user = config['user']\n self.password = config['password']\n self.dbname = dbname if dbname else config['dbname']\n self.driver = 'com.mysql.jdbc.Driver'\n self.jdbc = 'jdbc:mysql://'\n else:\n self.host = config['host']\n self.port = config['port']\n self.user = config['user']\n self.password = config['password']\n self.dbname = dbname if dbname else config['dbname']\n self.driver = 'org.postgresql.Driver'\n self.jdbc = 'jdbc:postgresql://'\n\nclass Database():\n def __init__(self, args, timeout=-1):\n self.args = args\n self.conn = self.resetConn(timeout)\n\n\n # self.schema = self.compute_table_schema()\n\n def resetConn(self, timeout=-1):\n conn = psycopg2.connect(database=self.args.dbname,\n user=self.args.user,\n password=self.args.password,\n host=self.args.host,\n port=self.args.port)\n return conn\n\n def execute_sqls(self,sql):\n self.conn =self.resetConn(timeout=-1)\n cur = self.conn.cursor()\n cur.execute(sql)\n self.conn.commit()\n cur.close()\n self.conn.close()\n\n def execute_sql_duration(self, duration, sql, max_id=0, commit_interval=500):\n self.conn = self.resetConn(timeout=-1)\n cursor = self.conn.cursor()\n start = time.time()\n cnt = 0\n if duration > 0:\n while (time.time() - start) < duration:\n if max_id > 0:\n id = random.randint(1, max_id - 1)\n cursor.execute(sql + str(id) + ';')\n else:\n cursor.execute(sql)\n cnt += 1\n if cnt % commit_interval == 0:\n self.conn.commit()\n else:\n print(\"error, the duration should be larger than 0\")\n self.conn.commit()\n cursor.close()\n self.conn.close()\n return cnt\n\n def concurrent_execute_sql(self, threads, duration, sql, max_id=0, commit_interval=500):\n pool = ThreadPool(threads)\n results = [pool.apply_async(self.execute_sql_duration, (duration, sql, max_id, commit_interval)) for _ in range(threads)]\n pool.close()\n pool.join()\n return results\n \ndef init():\n #add the config\n config_path = \"/root/DB-GPT/config/tool_config.yaml\"\n with open(config_path, 'r') as config_file:\n config = yaml.safe_load(config_file) \n db_args =DBArgs('pgsql', config)\n return db_args\n\n#create a table\ndef create_table(table_name,colsize, ncolumns):\n db=Database(init())\n column_definitions = ', '.join(f'name{i} varchar({colsize})' for i in range(ncolumns))\n creat_sql = f'CREATE TABLE {table_name} (id int, {column_definitions}, time timestamp);'\n db.execute_sqls(creat_sql)\n\n#delete the table\ndef delete_table(table_name):\n db=Database(init())\n delete_sql=f'DROP TABLE if exists {table_name}'\n db.execute_sqls(delete_sql)\n\n#print the current time\ndef print_time():\n current_time = datetime.datetime.now()\n formatted_time = current_time.strftime(\"%Y-%m-%d %H:%M:%S\")\n print(formatted_time)\n\n\ndef 
insert_large_data(threads,duration,ncolumns,nrows,colsize,table_name='table1'):\n\n print_time()\n #Delete undeleted tables\n delete_table(table_name)\n #create a new table\n create_table(table_name,colsize, ncolumns)\n db=Database(init())\n #insert the data\n #insert_definitions = ', '.join(f'repeat(round(random()*999)::text,{(colsize//3)})' for i in range(ncolumns))\n insert_definitions = ', '.join(f'(SELECT substr(md5(random()::text), 1, {colsize}))' for i in range(ncolumns))\n insert_data=f'insert into {table_name} select generate_series(1,{nrows}),{insert_definitions}, now();'\n db.concurrent_execute_sql(threads,duration,insert_data,commit_interval=1)\n\n #delete the table\n delete_table(table_name)\n \n #print the end time\n print_time()\nif __name__ == \"__main__\":\n # Number of threads to use for concurrent inserts\n num_threads = 188\n \n # Duration for which to run the inserts (in seconds)\n insert_duration = None\n \n # Number of columns in the table\n num_columns = 38\n \n # Number of rows to insert\n num_rows = 100\n \n # Size of each column (in characters)\n column_size = 72\n \n # Table name\n table_name = 'table1'\n \n # Call the insert_large_data function\n insert_large_data(num_threads, insert_duration, num_columns, num_rows, column_size, table_name)\n", + "description": "In a data-intensive application, a large amount of data generated by 188 sources needs to be inserted into the database simultaneously. This can result in a database exception due to the high volume of data being processed.\n", + "desc": "In an Internet of Things (IoT) application scenario, there is a database named 'IoTDataDB' that is used to store sensor data collected from various IoT devices. The database contains a table called 'SensorReadings', which stores information about the readings from these sensors. The table has a total of 341 rows, each representing a reading from a sensor, and it has 74 columns, each column containing up to 77 characters of data. These columns might include sensor ID, sensor type, sensor value, timestamp, location, and other details. In this scenario, 122 devices are transmitting sensor data to the database simultaneously. Due to the high concurrency of these write operations, there might be contention for database locks, resulting in performance issues for the database. 
This contention can lead to delayed processing or failure of other write operations, affecting the overall efficiency and reliability of the IoT application.\n" + }, + "82": { + "start_time": "1697308916", + "end_time": "1697308976", + "start_timestamp": "2023-10-15 02:41:56", + "end_timestamp": "2023-10-15 02:42:56", + "alerts": [], + "labels": [ + "highly concurrent updates" + ], + "command": "python anomaly_trigger/main.py --anomaly LOCK_CONTENTION", + "script": "import psycopg2\nimport sys\nsys.path.append('/root/DB-GPT/')\nimport time\nimport datetime\nimport random\nimport yaml\nfrom multiprocessing.pool import *\n\n\nclass DBArgs(object):\n\n def __init__(self, dbtype, config, dbname=None):\n self.dbtype = dbtype\n if self.dbtype == 'mysql':\n self.host = config['host']\n self.port = config['port']\n self.user = config['user']\n self.password = config['password']\n self.dbname = dbname if dbname else config['dbname']\n self.driver = 'com.mysql.jdbc.Driver'\n self.jdbc = 'jdbc:mysql://'\n else:\n self.host = config['host']\n self.port = config['port']\n self.user = config['user']\n self.password = config['password']\n self.dbname = dbname if dbname else config['dbname']\n self.driver = 'org.postgresql.Driver'\n self.jdbc = 'jdbc:postgresql://'\n\nclass Database():\n def __init__(self, args, timeout=-1):\n self.args = args\n self.conn = self.resetConn(timeout)\n\n\n # self.schema = self.compute_table_schema()\n\n def resetConn(self, timeout=-1):\n conn = psycopg2.connect(database=self.args.dbname,\n user=self.args.user,\n password=self.args.password,\n host=self.args.host,\n port=self.args.port)\n return conn\n \n def execute_sqls(self,sql):\n self.conn =self.resetConn(timeout=-1)\n cur = self.conn.cursor()\n cur.execute(sql)\n self.conn.commit()\n cur.close()\n self.conn.close()\n\n def execute_sql_duration(self, duration, sql, max_id=0, commit_interval=500):\n self.conn = self.resetConn(timeout=-1)\n cursor = self.conn.cursor()\n start = time.time()\n cnt = 0\n if duration > 0:\n while (time.time() - start) < duration:\n if max_id > 0:\n id = random.randint(1, max_id - 1)\n cursor.execute(sql + str(id) + ';')\n else:\n cursor.execute(sql)\n cnt += 1\n if cnt % commit_interval == 0:\n self.conn.commit()\n else:\n print(\"error, the duration should be larger than 0\")\n self.conn.commit()\n cursor.close()\n self.conn.close()\n return cnt\n\n def concurrent_execute_sql(self, threads, duration, sql, max_id=0, commit_interval=500):\n pool = ThreadPool(threads)\n results = [pool.apply_async(self.execute_sql_duration, (duration, sql, max_id, commit_interval)) for _ in range(threads)]\n pool.close()\n pool.join()\n return results\n\ndef init():\n #add the config\n config_path = \"/root/DB-GPT/config/tool_config.yaml\"\n with open(config_path, 'r') as config_file:\n config = yaml.safe_load(config_file) \n db_args =DBArgs('pgsql', config)\n return db_args\n\n\n#create a table\ndef create_table(table_name,colsize, ncolumns):\n db=Database(init())\n column_definitions = ', '.join(f'name{i} varchar({colsize})' for i in range(ncolumns))\n creat_sql = f'CREATE TABLE {table_name} (id int, {column_definitions}, time timestamp);'\n db.execute_sqls(creat_sql)\n\n#delete the table\ndef delete_table(table_name):\n db=Database(init())\n delete_sql=f'DROP TABLE if exists {table_name}'\n db.execute_sqls(delete_sql)\n\n#print the current time\ndef print_time():\n current_time = datetime.datetime.now()\n formatted_time = current_time.strftime(\"%Y-%m-%d %H:%M:%S\")\n print(formatted_time)\n\n\ndef 
lock_contention(threads,duration,ncolumns,nrows,colsize,table_name='table1'):\n #create a new table\n print_time()\n delete_table(table_name)\n create_table(table_name,colsize, ncolumns)\n db=Database(init())\n # insert some data to be updated \n insert_definitions = ', '.join(f'(SELECT substr(md5(random()::text), 1, {colsize}))' for i in range(ncolumns))\n insert_data=f'insert into {table_name} select generate_series(1,{nrows}),{insert_definitions}, now();' \n db.execute_sqls(insert_data) \n pool = Pool(threads)\n for _ in range(threads):\n pool.apply_async(\n lock, (table_name, ncolumns, colsize, duration, nrows))\n pool.close()\n pool.join()\n #delete the table\n delete_table(table_name)\n print_time()\n\ndef lock(table_name, ncolumns, colsize, duration, nrows):\n args=init()\n start = time.time()\n #lock_contention\n while time.time()-start < duration:\n conn = psycopg2.connect(database=args.dbname, user=args.user, password=args.password,\n host=args.host, port=args.port)\n cur = conn.cursor()\n while time.time()-start < duration:\n col_name = random.randint(0, ncolumns-1)\n row_name = random.randint(1, nrows-1)\n lock_contention = f'update {table_name} set name{col_name}=(SELECT substr(md5(random()::text), 1, {colsize})) where id ={row_name}'\n #db.concurrent_execute_sql(threads,duration,lock_contention,nrows)\n cur.execute(lock_contention)\n conn.commit()\n conn.commit()\n conn.close()\n\nif __name__ == \"__main__\":\n # Number of threads to use for concurrent inserts\n num_threads = 122\n \n # Duration for which to run the inserts (in seconds)\n insert_duration = None\n \n # Number of columns in the table\n num_columns = 74\n \n # Number of rows to insert\n num_rows = 341\n \n # Size of each column (in characters)\n column_size = 77\n \n # Table name\n table_name = 'table1'\n \n # Call the insert_large_data function\n lock_contention(num_threads, insert_duration, num_columns, num_rows, column_size, table_name)\n", + "description": "In the database of a large company, 122 employees are simultaneously trying to perform frequent update operations on a database table containing 74 columns and 341 rows of records. Each record has a column size of 77 characters. Due to the high number of users competing to lock the database table, a database exception occurs.\n", + "desc": "In the Internet of Things (IoT) scenario, there is a database for storing and analyzing sensor data, named 'SensorDataDB'. This database is designed to handle a large volume of data from various types of sensors. One of the key tables in the database is called 'SensorReadings', which contains information about different readings from sensors. This table consists of 3,143,893 rows of data, each representing a reading from a sensor, with 16 columns, each containing information of up to 72 characters. These columns may include sensor ID, sensor type, reading value, timestamp, location, and other related attributes. During the operation of the IoT system, the database administrator may need to perform a periodic data cleanup operation to remove outdated or irrelevant sensor readings. This process is commonly referred to as vacuuming the database. In this particular case, the administrator runs a script that triggers a vacuum operation on the 'SensorReadings' table. The script is executed with 169 threads, which means that the vacuum operation will be parallelized to speed up the process. 
The purpose of this vacuum operation is to reclaim the space occupied by the deleted or updated data and optimize the performance of the database. However, if the vacuum operation is not properly optimized or if it is performed during peak usage hours, it could potentially impact the performance of other database operations and introduce anomalies in the system. Specifically, the high number of threads and the large amount of data being processed could lead to resource contention and slower query response times. Therefore, it is recommended to carefully plan and schedule vacuum operations in order to minimize their impact on the overall system performance.\n" + }, + "83": { + "start_time": "1697309036", + "end_time": "1697309076", + "start_timestamp": "2023-10-15 02:43:56", + "end_timestamp": "2023-10-15 02:44:36", + "alerts": [], + "labels": [ + "highly deletes" + ], + "command": "python anomaly_trigger/main.py --anomaly VACUUM", + "script": "import psycopg2\nimport sys\nsys.path.append('/root/DB-GPT/')\nimport time\nimport datetime\nimport random\nimport yaml\nfrom multiprocessing.pool import *\n\n\nclass DBArgs(object):\n\n def __init__(self, dbtype, config, dbname=None):\n self.dbtype = dbtype\n if self.dbtype == 'mysql':\n self.host = config['host']\n self.port = config['port']\n self.user = config['user']\n self.password = config['password']\n self.dbname = dbname if dbname else config['dbname']\n self.driver = 'com.mysql.jdbc.Driver'\n self.jdbc = 'jdbc:mysql://'\n else:\n self.host = config['host']\n self.port = config['port']\n self.user = config['user']\n self.password = config['password']\n self.dbname = dbname if dbname else config['dbname']\n self.driver = 'org.postgresql.Driver'\n self.jdbc = 'jdbc:postgresql://'\n\nclass Database():\n def __init__(self, args, timeout=-1):\n self.args = args\n self.conn = self.resetConn(timeout)\n\n\n # self.schema = self.compute_table_schema()\n\n def resetConn(self, timeout=-1):\n conn = psycopg2.connect(database=self.args.dbname,\n user=self.args.user,\n password=self.args.password,\n host=self.args.host,\n port=self.args.port)\n return conn\n \n def execute_sqls(self,sql):\n self.conn =self.resetConn(timeout=-1)\n cur = self.conn.cursor()\n cur.execute(sql)\n self.conn.commit()\n cur.close()\n self.conn.close()\n\n def execute_sql_duration(self, duration, sql, max_id=0, commit_interval=500):\n self.conn = self.resetConn(timeout=-1)\n cursor = self.conn.cursor()\n start = time.time()\n cnt = 0\n if duration > 0:\n while (time.time() - start) < duration:\n if max_id > 0:\n id = random.randint(1, max_id - 1)\n cursor.execute(sql + str(id) + ';')\n else:\n cursor.execute(sql)\n cnt += 1\n if cnt % commit_interval == 0:\n self.conn.commit()\n else:\n print(\"error, the duration should be larger than 0\")\n self.conn.commit()\n cursor.close()\n self.conn.close()\n return cnt\n\n def concurrent_execute_sql(self, threads, duration, sql, max_id=0, commit_interval=500):\n pool = ThreadPool(threads)\n results = [pool.apply_async(self.execute_sql_duration, (duration, sql, max_id, commit_interval)) for _ in range(threads)]\n pool.close()\n pool.join()\n return results\n \ndef init():\n #add the config\n config_path = \"/root/DB-GPT/config/tool_config.yaml\"\n with open(config_path, 'r') as config_file:\n config = yaml.safe_load(config_file) \n db_args =DBArgs('pgsql', config)\n return db_args\n\n\n#create a table\ndef create_table(table_name,colsize, ncolumns):\n db=Database(init())\n column_definitions = ', '.join(f'name{i} varchar({colsize})' for i 
in range(ncolumns))\n creat_sql = f'CREATE TABLE {table_name} (id int, {column_definitions}, time timestamp);'\n db.execute_sqls(creat_sql)\n\n#delete the table\ndef delete_table(table_name):\n db=Database(init())\n delete_sql=f'DROP TABLE if exists {table_name}'\n db.execute_sqls(delete_sql)\n\n#print the current time\ndef print_time():\n current_time = datetime.datetime.now()\n formatted_time = current_time.strftime(\"%Y-%m-%d %H:%M:%S\")\n print(formatted_time)\n\n\ndef vacuum(threads,duration,ncolumns,nrows,colsize,table_name='table1'):\n db=Database(init())\n #create a new table\n print_time()\n delete_table(table_name)\n create_table(table_name,colsize, ncolumns)\n\n # insert some data to be deleted\n insert_definitions = ', '.join(f'(SELECT substr(md5(random()::text), 1, {colsize}))' for i in range(ncolumns))\n insert_data=f'insert into {table_name} select generate_series(1,{nrows}),{insert_definitions}, now();' \n db.execute_sqls(insert_data) \n\n # delete 80% of the rows\n delete_nrows=int(nrows*0.8)\n vacuum=f'delete from {table_name} where id < {delete_nrows};'\n db.execute_sqls(vacuum)\n\n # do the select , then the vacuum occurs\n select='select * from '+table_name+' where id='\n db.concurrent_execute_sql(threads,duration,select,nrows)\n\n #delete the table\n delete_table(table_name)\n print_time()\n\n\nif __name__ == \"__main__\":\n # Number of threads to use for concurrent inserts\n num_threads = 169\n \n # Duration for which to run the inserts (in seconds)\n insert_duration = None\n \n # Number of columns in the table\n num_columns = 16\n \n # Number of rows to insert\n num_rows = 3143893\n \n # Size of each column (in characters)\n column_size = 72\n \n # Table name\n table_name = 'table1'\n \n # Call the insert_large_data function\n vacuum(num_threads, insert_duration, num_columns, num_rows, column_size, table_name)\n", + "description": "In the database of an online platform, a large number of users are searching for products using various criteria such as product name, category, and price range. The database table contains 16 columns and 3,143,893 rows, with each column having a size of 72 characters. However, there is an issue with the database where it needs to be cleaned up and optimized. This script simulates the database exception that occurs when 169 users simultaneously perform a search after the VACUUM operation is performed on the database table.\n", + "desc": "In the business intelligence scenario, we can imagine a database called 'BusinessAnalyticsDB' that is dedicated to storing and analyzing data for business intelligence purposes. This database contains multiple tables, one of which is a key table named 'SalesRecords' that stores detailed information about sales transactions. In this case, the 'SalesRecords' table consists of 911,621 rows of data, each representing a sales transaction, with a total of 83 columns. These columns contain different information related to the sales transaction, such as transaction ID, customer ID, product ID, quantity, price, discount, payment method, sales date, sales region, and more. To improve the efficiency of analytical queries, the database administrator decides to create redundant indexes on various columns in the 'SalesRecords' table. These indexes are created to speed up complex queries, such as sales by region, sales by product category, or sales by customer segment. 
However, creating redundant indexes without careful planning and analysis can lead to increased storage requirements and additional performance overhead in the database.When 10 users simultaneously perform complex analytical queries on the 'SalesRecords' table, the database may experience performance issues due to the presence of redundant indexes. These issues can result in slower query execution times, increased disk input/output operations, and potential locking or contention problems. Ultimately, these anomalies can impact the overall performance and efficiency of the business intelligence analysis in the database.\n" + }, + "84": { + "start_time": "1697309136", + "end_time": "1697309205", + "start_timestamp": "2023-10-15 02:45:36", + "end_timestamp": "2023-10-15 02:46:45", + "alerts": [], + "labels": [ + "too many indexes" + ], + "command": "python anomaly_trigger/main.py --anomaly REDUNDANT_INDEX", + "script": "import psycopg2\nimport sys\nsys.path.append('/root/DB-GPT/')\nimport time\nimport datetime\nimport random\nimport yaml\nfrom multiprocessing.pool import *\n\n\nclass DBArgs(object):\n\n def __init__(self, dbtype, config, dbname=None):\n self.dbtype = dbtype\n if self.dbtype == 'mysql':\n self.host = config['host']\n self.port = config['port']\n self.user = config['user']\n self.password = config['password']\n self.dbname = dbname if dbname else config['dbname']\n self.driver = 'com.mysql.jdbc.Driver'\n self.jdbc = 'jdbc:mysql://'\n else:\n self.host = config['host']\n self.port = config['port']\n self.user = config['user']\n self.password = config['password']\n self.dbname = dbname if dbname else config['dbname']\n self.driver = 'org.postgresql.Driver'\n self.jdbc = 'jdbc:postgresql://'\n\nclass Database():\n def __init__(self, args, timeout=-1):\n self.args = args\n self.conn = self.resetConn(timeout)\n\n\n # self.schema = self.compute_table_schema()\n\n def resetConn(self, timeout=-1):\n conn = psycopg2.connect(database=self.args.dbname,\n user=self.args.user,\n password=self.args.password,\n host=self.args.host,\n port=self.args.port)\n return conn\n \n def execute_sqls(self,sql):\n self.conn =self.resetConn(timeout=-1)\n cur = self.conn.cursor()\n cur.execute(sql)\n self.conn.commit()\n cur.close()\n self.conn.close()\n\n def execute_sql_duration(self, duration, sql, max_id=0, commit_interval=500):\n self.conn = self.resetConn(timeout=-1)\n cursor = self.conn.cursor()\n start = time.time()\n cnt = 0\n if duration > 0:\n while (time.time() - start) < duration:\n if max_id > 0:\n id = random.randint(1, max_id - 1)\n cursor.execute(sql + str(id) + ';')\n else:\n cursor.execute(sql)\n cnt += 1\n if cnt % commit_interval == 0:\n self.conn.commit()\n else:\n print(\"error, the duration should be larger than 0\")\n self.conn.commit()\n cursor.close()\n self.conn.close()\n return cnt\n\n def concurrent_execute_sql(self, threads, duration, sql, max_id=0, commit_interval=500):\n pool = ThreadPool(threads)\n results = [pool.apply_async(self.execute_sql_duration, (duration, sql, max_id, commit_interval)) for _ in range(threads)]\n pool.close()\n pool.join()\n return results\n \n def build_index(self, table_name, idx_num):\n self.conn = self.resetConn(timeout=-1)\n cursor = self.conn.cursor()\n \n for i in range(0, idx_num):\n the_sql = 'CREATE INDEX index_' + table_name + '_' + str(i) + ' ON ' + table_name + '(name' + str(i) + ');'\n print(the_sql)\n cursor.execute(the_sql)\n\n \n self.conn.commit()\n self.conn.close()\n return\n\n\n \n def drop_index(self,table_name):\n self.conn = 
self.resetConn(timeout=-1)\n cursor = self.conn.cursor()\n cursor.execute(\"select indexname from pg_indexes where tablename='\"+table_name+\"';\")\n idxs = cursor.fetchall()\n for idx in idxs:\n the_sql = 'DROP INDEX ' + idx[0] + ';'\n cursor.execute(the_sql)\n print(the_sql)\n self.conn.commit()\n self.conn.close()\n return\n\n\ndef init():\n #add the config\n config_path = \"/root/DB-GPT/config/tool_config.yaml\"\n with open(config_path, 'r') as config_file:\n config = yaml.safe_load(config_file) \n db_args =DBArgs('pgsql', config)\n return db_args\n\n\n#create a table\ndef create_table(table_name,colsize, ncolumns):\n db=Database(init())\n column_definitions = ', '.join(f'name{i} varchar({colsize})' for i in range(ncolumns))\n creat_sql = f'CREATE TABLE {table_name} (id int, {column_definitions}, time timestamp);'\n db.execute_sqls(creat_sql)\n\n#delete the table\ndef delete_table(table_name):\n db=Database(init())\n delete_sql=f'DROP TABLE if exists {table_name}'\n db.execute_sqls(delete_sql)\n\n#print the current time\ndef print_time():\n current_time = datetime.datetime.now()\n formatted_time = current_time.strftime(\"%Y-%m-%d %H:%M:%S\")\n print(formatted_time)\n\ndef redundent_index(threads,duration,ncolumns,nrows,colsize,nindex,table_name='table1'):\n #create a new table\n print_time()\n delete_table(table_name)\n create_table(table_name,colsize, ncolumns)\n db=Database(init())\n # insert some data to be updated \n insert_definitions = ', '.join(f'(SELECT substr(md5(random()::text), 1, {colsize}))' for i in range(ncolumns))\n insert_data=f'insert into {table_name} select generate_series(1,{nrows}),{insert_definitions}, now();' \n db.execute_sqls(insert_data) \n\n #initialization of the indexes\n nindex=int((nindex*ncolumns)/10)\n db.build_index(table_name,nindex)\n id_index='CREATE INDEX index_'+table_name+'_id ON '+table_name+'(id);'\n db.execute_sqls(id_index)\n\n #lock_contention\n pool = Pool(threads)\n for _ in range(threads):\n pool.apply_async(\n lock, (table_name, ncolumns, colsize, duration, nrows))\n pool.close()\n pool.join()\n\n #drop the index\n db.drop_index(table_name)\n\n #delete the table\n delete_table(table_name)\n print_time()\n\n\ndef lock(table_name, ncolumns, colsize, duration, nrows):\n args=init()\n start = time.time()\n #lock_contention\n while time.time()-start < duration:\n conn = psycopg2.connect(database=args.dbname, user=args.user, password=args.password,\n host=args.host, port=args.port)\n cur = conn.cursor()\n while time.time()-start < duration:\n col_name = random.randint(0, ncolumns-1)\n row_name = random.randint(1, nrows-1)\n lock_contention = f'update {table_name} set name{col_name}=(SELECT substr(md5(random()::text), 1, {colsize})) where id ={row_name}'\n #db.concurrent_execute_sql(threads,duration,lock_contention,nrows)\n cur.execute(lock_contention)\n conn.commit()\n conn.commit()\n conn.close()\n\nif __name__ == \"__main__\":\n # Number of threads to use for concurrent inserts\n num_threads = 10\n \n # Duration for which to run the inserts (in seconds)\n insert_duration = None\n \n # Number of columns in the table\n num_columns = 83\n \n # Number of rows to insert\n num_rows = 911621\n \n # Size of each column (in characters)\n column_size = 51\n \n # Table name\n table_name = 'table1'\n \n nindex=6\n \n # Call the insert_large_data function\n redundent_index(num_threads, insert_duration, num_columns, num_rows, column_size, nindex,table_name)\n", + "description": "In a database system for an online marketplace, 10 users are conducting a 
query operation on a database table containing 83 columns and 911,621 rows of data. Each column has a size of 51 characters. However, these users are also creating redundant indexes for items such as product name, category, and price range during the query, which can lead to additional storage usage and performance overhead.\n", + "desc": "In an Internet of Things (IoT) scenario, there is a database used for collecting and analyzing sensor data. This database, called 'SensorDataDB', is designed to handle a large volume of data from various types of sensors. One of the primary tables in the database is named 'SensorReadings', which contains fields to store data from these sensors, such as sensor ID, reading type, and reading value. In this specific scenario, there is a need to insert a large amount of data into the 'SensorReadings' table. This large data insertion is done simultaneously by multiple users or sensors, causing high concurrency in the database. However, due to the lack of proper optimization techniques such as data partitioning or buffering mechanisms, the database's ability to handle these concurrent write requests is limited. As a result, the database may encounter performance issues and I/O (input/output) contention, leading to slower write speeds and potential delays in processing the incoming data. This can impact the real-time monitoring and analysis of sensor data, as well as the overall efficiency of the IoT system.\n" + }, + "85": { + "start_time": "1697309265", + "end_time": "1697309356", + "start_timestamp": "2023-10-15 02:47:45", + "end_timestamp": "2023-10-15 02:49:16", + "alerts": [], + "labels": [ + "INSERT_LARGE_DATA", + "IO_CONTENTION" + ], + "command": "python anomaly_trigger/main.py --anomaly INSERT_LARGE_DATA,IO_CONTENTION", + "script": "import os\nimport datetime\n\n#print the current time\ndef print_time():\n current_time = datetime.datetime.now()\n formatted_time = current_time.strftime(\"%Y-%m-%d %H:%M:%S\")\n print(formatted_time)\n\nif __name__ == \"__main__\":\n print_time()\n command = (\n \"su - root -c 'cd /sysbench-tpcc-master; \"\n \"./tpcc.lua --db-driver=pgsql --tables=2 --scale=3 --threads=50 --events=0 \"\n \"--pgsql-host=xxxx --pgsql-user=xxxx --pgsql-password=xxxx \"\n \"--pgsql-port=5432 --pgsql-db=tpcc --time=90 --rand-type=uniform --report-interval=10 run'\"\n )\n\n os.system(command)\n print_time()\n", + "description": "In a system where multiple users are sharing files simultaneously, there is a high volume of data being uploaded, downloaded, or edited. This creates contention for input/output operations, resulting in slower file transfers.\n", + "desc": "In the scenario of an Internet of Things (IoT) application, there is a database named 'SensorAnalyticsDB' that stores and analyzes sensor data collected from various IoT devices. The database includes a table called 'SensorReadings', which contains data from multiple sensors. Each row in the table represents a reading from a specific sensor and includes information such as sensor ID, timestamp, reading value, location, and type of sensor (e.g., temperature, humidity, pressure). In this scenario, there is a performance issue related to joining multiple tables in a query. The database administrator wants to improve the efficiency of joining the 'SensorReadings' table with other related tables, such as 'DeviceInformation' and 'LocationDetails'. These tables contain additional information about the sensors and their locations. 
Due to poor indexing, inefficient query plans, or improper database configuration, the database's performance is adversely affected when joining these tables. This can lead to slow query execution times and increased CPU contention. The high CPU utilization caused by the slow join operations can impact other database operations and reduce the overall system performance. The use of the 'anomaly_trigger' script with the 'POOR_JOIN_PERFORMANCE' and 'CPU_CONTENTION' anomalies helps to simulate this scenario and identify potential performance issues related to joining tables in the IoT database.\n" + }, + "86": { + "start_time": "1697309416", + "end_time": "1697309476", + "start_timestamp": "2023-10-15 02:50:16", + "end_timestamp": "2023-10-15 02:51:16", + "alerts": [], + "labels": [ + "POOR JOIN PERFORMANCE", + "CPU CONTENTION" + ], + "command": "python anomaly_trigger/main.py --anomaly POOR_JOIN_PERFORMANCE,CPU_CONTENTION", + "script": "import os\nimport re\nimport time\n\nimport psycopg2\n\n\nREPEATCOUNT = 1\nTIMELOGPATH = str(int(time.time())) + \"_job_trigger_time_log.txt\"\nTIMELOG = open(TIMELOGPATH, 'w+')\n\n\nclass Database():\n\n def __init__(self):\n self.conn = None\n self.conn = psycopg2.connect(database='imdbload',\n user='xxxx',\n password='xxxx',\n host='xxxx',\n port=5432)\n\n def execute_sql(self, sql):\n fail = 1\n cur = self.conn.cursor()\n i = 0\n cnt = 3\n while fail == 1 and i < cnt:\n try:\n fail = 0\n cur.execute(sql)\n except BaseException as error:\n fail = 1\n print(error)\n res = []\n if fail == 0:\n res = cur.fetchall()\n i = i + 1\n if fail == 1:\n # print(\"SQL Execution Fatal!!\", sql)\n return 0, ''\n elif fail == 0:\n return 1, res\n\n\ndef all_sql_files():\n res_path = \"{}/join-order-benchmark-master/\".format(\n os.path.dirname(os.path.abspath(__file__)))\n # all_file_list = list(filter(file_filter, os.listdir(res_path)))\n # all_file_list = sorted(all_file_list, key=custom_sort)\n all_file_list = [\n '1a.sql', '1b.sql', '1c.sql', '1d.sql',\n '2a.sql', '2b.sql', '2c.sql', '2d.sql',\n '3a.sql', '3b.sql', '3c.sql',\n '4a.sql', '4b.sql', '4c.sql',\n '5a.sql', '5b.sql', '5c.sql',\n '6a.sql', '6b.sql', '6c.sql', '6d.sql', '6e.sql', '6f.sql',\n '7a.sql', '7b.sql', '7c.sql',\n '8a.sql', '8b.sql', '8c.sql', '8d.sql',\n '9a.sql', '9b.sql', '9c.sql', '9d.sql',\n '10a.sql', '10b.sql', '10c.sql',\n '11a.sql', '11b.sql', '11c.sql', '11d.sql',\n '12a.sql', '12b.sql', '12c.sql',\n '13a.sql', '13b.sql', '13c.sql', '13d.sql',\n '14a.sql', '14b.sql', '14c.sql',\n '15a.sql', '15b.sql', '15c.sql', '15d.sql',\n '16a.sql', '16b.sql', '16c.sql', '16d.sql',\n '17a.sql', '17b.sql', '17c.sql', '17d.sql', '17e.sql', '17f.sql',\n '18a.sql', '18b.sql', '18c.sql',\n '19a.sql', '19b.sql', '19c.sql', '19d.sql',\n '20a.sql', '20b.sql', '20c.sql',\n '21a.sql', '21b.sql', '21c.sql',\n '22a.sql', '22b.sql', '22c.sql', '22d.sql',\n '23a.sql', '23b.sql', '23c.sql',\n '24a.sql', '24b.sql',\n '25a.sql', '25b.sql', '25c.sql',\n '26a.sql', '26b.sql', '26c.sql',\n '27a.sql', '27b.sql', '27c.sql',\n '28a.sql', '28b.sql', '28c.sql',\n '29a.sql', '29b.sql', '29c.sql',\n '30a.sql', '30b.sql', '30c.sql',\n '31a.sql', '31b.sql', '31c.sql',\n '32a.sql', '32b.sql',\n '33a.sql', '33b.sql', '33c.sql']\n\n print(all_file_list)\n files_list = []\n for file in all_file_list:\n files_list.append(res_path + file)\n return files_list\n\n\ndef custom_sort(item):\n # \u63d0\u53d6\u6570\u5b57\u548c\u5b57\u6bcd\u90e8\u5206\n match = re.match(r'(\\d+)(\\D+)', item)\n # 
\u5c06\u6570\u5b57\u90e8\u5206\u8f6c\u6362\u4e3a\u6574\u6570\u4ee5\u8fdb\u884c\u6bd4\u8f83\n num_part = int(match.group(1))\n # \u8fd4\u56de\u5143\u7ec4\u4ee5\u6309\u6570\u5b57\u548c\u5b57\u6bcd\u6392\u5e8f\n return (num_part, match.group(2))\n\n\ndef file_filter(f):\n if f[-4:] == '.sql' and 'schema' not in f and 'fkindexes' not in f:\n return True\n else:\n return False\n\n\ndef get_sql_from_file(file_name):\n file = open(file_name)\n lines = file.readlines().copy()\n sql = ''\n for line in lines:\n sql += line\n sql = sql.replace('\n', ' ').replace(' ', ' ').replace(' ', ' ')\n file.close()\n return sql\n\n\ndef test_hint_from_file(sql_file):\n db = Database()\n sql = get_sql_from_file(sql_file)\n success, result_cont = db.execute_sql(sql)\n print(success, result_cont)\n\n\ndef test_all():\n sql_files = all_sql_files()\n\n for sql_file in list(sql_files)[:-10]:\n if sql_file:\n test_hint_from_file(sql_file)\n\n\ndef test_one():\n res_path = \"{}/join-order-benchmark-master/\".format(\n os.path.dirname(os.path.abspath(__file__)))\n test_hint_from_file(res_path + '1a.sql')\n\n\nif __name__ == '__main__':\n for i in range(0, REPEATCOUNT):\n TIMELOG.write(str(int(time.time()))+\";\")\n test_all()\n TIMELOG.write(str(int(time.time()))+\"\n\")\n TIMELOG.flush()\n\n TIMELOG.close()\n\n)\n", + "description": "In a business intelligence system, multiple users are performing join operations on large datasets simultaneously. The system experiences poor performance in the join operations, and there is competition for CPU resources among the users, resulting in slower execution.\n", + "desc": "In an Internet of Things (IoT) scenario, suppose there is a centralized database named 'IoTDataDB', which is responsible for collecting and storing data from various IoT devices. This database consists of multiple tables, including a key table named 'SensorData', which records sensor readings from different devices. Each sensor reading contains information such as device ID, sensor type, reading value, timestamp, and location. The 'SensorData' table may contain millions or even billions of records, as it continuously receives data from a large number of sensors.Retrieving such a large amount of data from the 'SensorData' table can be a time-consuming process. The database might need to read a significant amount of data from disk, which can result in slower query execution and response times. The size of the data being fetched and the complexity of the queries can significantly impact the performance of the database.The queries being performed on the 'SensorData' table involve correlated subqueries. Correlated subqueries are subqueries that depend on the results of the outer query. In this case, the complex queries performed by the data analyst might include subqueries that depend on the outer query's results to retrieve and process the required data. However, using correlated subqueries in large-scale data retrieval scenarios can lead to inefficient query execution. The database might need to execute multiple subqueries for each row returned by the outer query, resulting in increased query execution times and reduced performance.Overall, the combination of the 'FETCH_LARGE_DATA' and 'CORRELATED_SUBQUERY' anomalies in this IoT data analysis scenario can lead to slower query execution and reduced performance of the database. 
It is important to consider optimizing the queries, indexing the relevant columns, and implementing caching or other performance enhancement techniques to mitigate the impact of these anomalies on the data analysis process.\n" + }, + "87": { + "start_time": "1697309536", + "end_time": "1697309695", + "start_timestamp": "2023-10-15 02:52:16", + "end_timestamp": "2023-10-15 02:54:55", + "alerts": [], + "labels": [ + "FETCH_LARGE_DATA", + "CORRELATED SUBQUERY" + ], + "command": "python anomaly_trigger/main.py --anomaly FETCH_LARGE_DATA,CORRELATED_SUBQUERY", + "script": "import os\nimport re\nimport time\n\nimport psycopg2\n\n\nREPEATCOUNT = 1\nTIMELOGPATH = str(int(time.time())) + \"_tpch_trigger_time_log.txt\"\nTIMELOG = open(TIMELOGPATH, 'w+')\n\n\nclass Database():\n\n def __init__(self):\n self.conn = None\n self.conn = psycopg2.connect(database='tpch',\n user='xxxx',\n password='xxxx',\n host='xxxx',\n port=5432)\n\n def execute_sql(self, sql):\n fail = 1\n cur = self.conn.cursor()\n i = 0\n cnt = 3\n while fail == 1 and i < cnt:\n try:\n fail = 0\n cur.execute(sql)\n except BaseException as error:\n fail = 1\n print(error)\n res = []\n if fail == 0:\n res = cur.fetchall()\n i = i + 1\n if fail == 1:\n # print(\"SQL Execution Fatal!!\", sql)\n return 0, ''\n elif fail == 0:\n return 1, res\n\n\ndef all_sql_files():\n res_path = \"{}/tpch-queries/\".format(\n os.path.dirname(os.path.abspath(__file__)))\n # all_file_list = list(filter(file_filter, os.listdir(res_path)))\n # all_file_list = sorted(all_file_list, key=custom_sort)\n all_file_list = [\n '4.explain.sql']\n\n print(all_file_list)\n files_list = []\n for file in all_file_list:\n files_list.append(res_path + file)\n return files_list\n\n\ndef custom_sort(item):\n # \u63d0\u53d6\u6570\u5b57\u548c\u5b57\u6bcd\u90e8\u5206\n match = re.match(r'(\\d+)(\\D+)', item)\n # \u5c06\u6570\u5b57\u90e8\u5206\u8f6c\u6362\u4e3a\u6574\u6570\u4ee5\u8fdb\u884c\u6bd4\u8f83\n num_part = int(match.group(1))\n # \u8fd4\u56de\u5143\u7ec4\u4ee5\u6309\u6570\u5b57\u548c\u5b57\u6bcd\u6392\u5e8f\n return (num_part, match.group(2))\n\n\ndef file_filter(f):\n if f[-4:] == '.sql' and 'schema' not in f and 'fkindexes' not in f:\n return True\n else:\n return False\n\n\ndef get_sql_from_file(file_name):\n file = open(file_name)\n lines = file.readlines().copy()\n sql = ''\n for line in lines:\n sql += line\n sql = sql.replace('\n', ' ').replace(' ', ' ').replace(' ', ' ')\n file.close()\n return sql\n\n\ndef test_hint_from_file(sql_file):\n db = Database()\n sql = get_sql_from_file(sql_file)\n success, result_cont = db.execute_sql(sql)\n print(success, result_cont)\n\n\ndef test_all():\n sql_files = all_sql_files()\n\n for sql_file in list(sql_files):\n if sql_file:\n test_hint_from_file(sql_file)\n\n\ndef test_one():\n res_path = \"{}/tpch-queries/\".format(\n os.path.dirname(os.path.abspath(__file__)))\n test_hint_from_file(res_path + '1.explain.sql')\n\n\nif __name__ == '__main__':\n for i in range(0, REPEATCOUNT):\n TIMELOG.write(str(int(time.time()))+\";\")\n test_all()\n TIMELOG.write(str(int(time.time()))+\"\n\")\n TIMELOG.flush()\n\n TIMELOG.close()\n", + "description": "In an online store's database, retrieving a large amount of data for each product, such as inventory levels, may involve executing related subqueries. 
If these subqueries are not optimized, the performance of querying inventory may be negatively affected.\n", + "desc": "In an Internet of Things (IoT) scenario, there is a database used for collecting and analyzing sensor data, called 'SensorDataDB'. This database is designed to handle a large volume of data from various types of sensors. One of the primary tables in this database is 'SensorReadings', which stores readings from sensors, such as temperature, humidity, pressure, light, and motion. Each sensor reading is stored as a row in this table, and the table has a total of 5 columns, with each column having a size of 69 characters. These columns may include sensor ID, reading type, reading value, timestamp, and sensor location. Suppose there are 65 rows of data in the 'SensorReadings' table, representing readings from multiple sensors over a period. Now, let's consider a scenario where 146 sensors start transmitting data simultaneously at a very high frequency. Due to the lack of effective data partitioning in the 'SensorReadings' table or insufficient buffering mechanisms, the database might face performance issues in handling such a large number of concurrent write requests. This can result in increased write latency, which can cause anomalies in the database. These anomalies might impact the database's ability to process new sensor readings efficiently, leading to delays or failures in data ingestion. Additionally, it can also affect the accuracy and timeliness of data analysis and processing in the entire IoT system.\n" + }, + "88": { + "start_time": "1697309756", + "end_time": "1697309763", + "start_timestamp": "2023-10-15 02:55:56", + "end_timestamp": "2023-10-15 02:56:03", + "alerts": [], + "labels": [ + "highly concurrent commits or highly concurrent inserts" + ], + "command": "python anomaly_trigger/main.py --anomaly INSERT_LARGE_DATA", + "script": "import psycopg2\nimport sys\nsys.path.append('/root/DB-GPT/')\nimport time\nimport datetime\nimport random\nimport yaml\nfrom multiprocessing.pool import *\n\nclass DBArgs(object):\n\n def __init__(self, dbtype, config, dbname=None):\n self.dbtype = dbtype\n if self.dbtype == 'mysql':\n self.host = config['host']\n self.port = config['port']\n self.user = config['user']\n self.password = config['password']\n self.dbname = dbname if dbname else config['dbname']\n self.driver = 'com.mysql.jdbc.Driver'\n self.jdbc = 'jdbc:mysql://'\n else:\n self.host = config['host']\n self.port = config['port']\n self.user = config['user']\n self.password = config['password']\n self.dbname = dbname if dbname else config['dbname']\n self.driver = 'org.postgresql.Driver'\n self.jdbc = 'jdbc:postgresql://'\n\nclass Database():\n def __init__(self, args, timeout=-1):\n self.args = args\n self.conn = self.resetConn(timeout)\n\n\n # self.schema = self.compute_table_schema()\n\n def resetConn(self, timeout=-1):\n conn = psycopg2.connect(database=self.args.dbname,\n user=self.args.user,\n password=self.args.password,\n host=self.args.host,\n port=self.args.port)\n return conn\n\n def execute_sqls(self,sql):\n self.conn =self.resetConn(timeout=-1)\n cur = self.conn.cursor()\n cur.execute(sql)\n self.conn.commit()\n cur.close()\n self.conn.close()\n\n def execute_sql_duration(self, duration, sql, max_id=0, commit_interval=500):\n self.conn = self.resetConn(timeout=-1)\n cursor = self.conn.cursor()\n start = time.time()\n cnt = 0\n if duration > 0:\n while (time.time() - start) < duration:\n if max_id > 0:\n id = random.randint(1, max_id - 1)\n 
cursor.execute(sql + str(id) + ';')\n else:\n cursor.execute(sql)\n cnt += 1\n if cnt % commit_interval == 0:\n self.conn.commit()\n else:\n print(\"error, the duration should be larger than 0\")\n self.conn.commit()\n cursor.close()\n self.conn.close()\n return cnt\n\n def concurrent_execute_sql(self, threads, duration, sql, max_id=0, commit_interval=500):\n pool = ThreadPool(threads)\n results = [pool.apply_async(self.execute_sql_duration, (duration, sql, max_id, commit_interval)) for _ in range(threads)]\n pool.close()\n pool.join()\n return results\n \ndef init():\n #add the config\n config_path = \"/root/DB-GPT/config/tool_config.yaml\"\n with open(config_path, 'r') as config_file:\n config = yaml.safe_load(config_file) \n db_args =DBArgs('pgsql', config)\n return db_args\n\n#create a table\ndef create_table(table_name,colsize, ncolumns):\n db=Database(init())\n column_definitions = ', '.join(f'name{i} varchar({colsize})' for i in range(ncolumns))\n creat_sql = f'CREATE TABLE {table_name} (id int, {column_definitions}, time timestamp);'\n db.execute_sqls(creat_sql)\n\n#delete the table\ndef delete_table(table_name):\n db=Database(init())\n delete_sql=f'DROP TABLE if exists {table_name}'\n db.execute_sqls(delete_sql)\n\n#print the current time\ndef print_time():\n current_time = datetime.datetime.now()\n formatted_time = current_time.strftime(\"%Y-%m-%d %H:%M:%S\")\n print(formatted_time)\n\n\ndef insert_large_data(threads,duration,ncolumns,nrows,colsize,table_name='table1'):\n\n print_time()\n #Delete undeleted tables\n delete_table(table_name)\n #create a new table\n create_table(table_name,colsize, ncolumns)\n db=Database(init())\n #insert the data\n #insert_definitions = ', '.join(f'repeat(round(random()*999)::text,{(colsize//3)})' for i in range(ncolumns))\n insert_definitions = ', '.join(f'(SELECT substr(md5(random()::text), 1, {colsize}))' for i in range(ncolumns))\n insert_data=f'insert into {table_name} select generate_series(1,{nrows}),{insert_definitions}, now();'\n db.concurrent_execute_sql(threads,duration,insert_data,commit_interval=1)\n\n #delete the table\n delete_table(table_name)\n \n #print the end time\n print_time()\nif __name__ == \"__main__\":\n # Number of threads to use for concurrent inserts\n num_threads = 146\n \n # Duration for which to run the inserts (in seconds)\n insert_duration = None\n \n # Number of columns in the table\n num_columns = 5\n \n # Number of rows to insert\n num_rows = 65\n \n # Size of each column (in characters)\n column_size = 69\n \n # Table name\n table_name = 'table1'\n \n # Call the insert_large_data function\n insert_large_data(num_threads, insert_duration, num_columns, num_rows, column_size, table_name)\n", + "description": "In a data-intensive application like an IoT system, a large number of data generated by 146 sensors needs to be inserted into the database simultaneously. Each data point contains 5 columns, with each column having a size of 69 characters. The database might experience an exception due to the high volume of data being inserted at the same time.\n", + "desc": "In an Internet of Things (IoT) scenario, there is a database specifically designed for collecting and analyzing sensor data from various devices, named 'SensorDataDB'. This database stores information from 94 sensors, with each row representing a new data entry. 
The 'SensorDataDB' contains 33 columns, each with a size of 69 characters, to store sensor data such as sensor ID, reading type, reading value, timestamp, sensor location, and status information. When 146 devices start transmitting data simultaneously at a high frequency, the database may encounter performance issues. Due to the lack of efficient data partitioning, buffering mechanisms, or proper indexing in the 'SensorDataDB', it may struggle with handling a large number of concurrent write requests. This can result in increased write latency, impacting the database's ability to process data efficiently and potentially leading to anomalies.\n" + }, + "89": { + "start_time": "1697309823", + "end_time": "1697309895", + "start_timestamp": "2023-10-15 02:57:03", + "end_timestamp": "2023-10-15 02:58:15", + "alerts": [], + "labels": [ + "highly concurrent commits or highly concurrent inserts" + ], + "command": "python anomaly_trigger/main.py --anomaly INSERT_LARGE_DATA", + "script": "import psycopg2\nimport sys\nsys.path.append('/root/DB-GPT/')\nimport time\nimport datetime\nimport random\nimport yaml\nfrom multiprocessing.pool import *\n\nclass DBArgs(object):\n\n def __init__(self, dbtype, config, dbname=None):\n self.dbtype = dbtype\n if self.dbtype == 'mysql':\n self.host = config['host']\n self.port = config['port']\n self.user = config['user']\n self.password = config['password']\n self.dbname = dbname if dbname else config['dbname']\n self.driver = 'com.mysql.jdbc.Driver'\n self.jdbc = 'jdbc:mysql://'\n else:\n self.host = config['host']\n self.port = config['port']\n self.user = config['user']\n self.password = config['password']\n self.dbname = dbname if dbname else config['dbname']\n self.driver = 'org.postgresql.Driver'\n self.jdbc = 'jdbc:postgresql://'\n\nclass Database():\n def __init__(self, args, timeout=-1):\n self.args = args\n self.conn = self.resetConn(timeout)\n\n\n # self.schema = self.compute_table_schema()\n\n def resetConn(self, timeout=-1):\n conn = psycopg2.connect(database=self.args.dbname,\n user=self.args.user,\n password=self.args.password,\n host=self.args.host,\n port=self.args.port)\n return conn\n\n def execute_sqls(self,sql):\n self.conn =self.resetConn(timeout=-1)\n cur = self.conn.cursor()\n cur.execute(sql)\n self.conn.commit()\n cur.close()\n self.conn.close()\n\n def execute_sql_duration(self, duration, sql, max_id=0, commit_interval=500):\n self.conn = self.resetConn(timeout=-1)\n cursor = self.conn.cursor()\n start = time.time()\n cnt = 0\n if duration > 0:\n while (time.time() - start) < duration:\n if max_id > 0:\n id = random.randint(1, max_id - 1)\n cursor.execute(sql + str(id) + ';')\n else:\n cursor.execute(sql)\n cnt += 1\n if cnt % commit_interval == 0:\n self.conn.commit()\n else:\n print(\"error, the duration should be larger than 0\")\n self.conn.commit()\n cursor.close()\n self.conn.close()\n return cnt\n\n def concurrent_execute_sql(self, threads, duration, sql, max_id=0, commit_interval=500):\n pool = ThreadPool(threads)\n results = [pool.apply_async(self.execute_sql_duration, (duration, sql, max_id, commit_interval)) for _ in range(threads)]\n pool.close()\n pool.join()\n return results\n \ndef init():\n #add the config\n config_path = \"/root/DB-GPT/config/tool_config.yaml\"\n with open(config_path, 'r') as config_file:\n config = yaml.safe_load(config_file) \n db_args =DBArgs('pgsql', config)\n return db_args\n\n#create a table\ndef create_table(table_name,colsize, ncolumns):\n db=Database(init())\n column_definitions = ', 
'.join(f'name{i} varchar({colsize})' for i in range(ncolumns))\n creat_sql = f'CREATE TABLE {table_name} (id int, {column_definitions}, time timestamp);'\n db.execute_sqls(creat_sql)\n\n#delete the table\ndef delete_table(table_name):\n db=Database(init())\n delete_sql=f'DROP TABLE if exists {table_name}'\n db.execute_sqls(delete_sql)\n\n#print the current time\ndef print_time():\n current_time = datetime.datetime.now()\n formatted_time = current_time.strftime(\"%Y-%m-%d %H:%M:%S\")\n print(formatted_time)\n\n\ndef insert_large_data(threads,duration,ncolumns,nrows,colsize,table_name='table1'):\n\n print_time()\n #Delete undeleted tables\n delete_table(table_name)\n #create a new table\n create_table(table_name,colsize, ncolumns)\n db=Database(init())\n #insert the data\n #insert_definitions = ', '.join(f'repeat(round(random()*999)::text,{(colsize//3)})' for i in range(ncolumns))\n insert_definitions = ', '.join(f'(SELECT substr(md5(random()::text), 1, {colsize}))' for i in range(ncolumns))\n insert_data=f'insert into {table_name} select generate_series(1,{nrows}),{insert_definitions}, now();'\n db.concurrent_execute_sql(threads,duration,insert_data,commit_interval=1)\n\n #delete the table\n delete_table(table_name)\n \n #print the end time\n print_time()\nif __name__ == \"__main__\":\n # Number of threads to use for concurrent inserts\n num_threads = 146\n \n # Duration for which to run the inserts (in seconds)\n insert_duration = None\n \n # Number of columns in the table\n num_columns = 33\n \n # Number of rows to insert\n num_rows = 94\n \n # Size of each column (in characters)\n column_size = 69\n \n # Table name\n table_name = 'table1'\n \n # Call the insert_large_data function\n insert_large_data(num_threads, insert_duration, num_columns, num_rows, column_size, table_name)\n", + "description": "In a data logging system, 146 sensors are generating a large amount of data simultaneously. This data needs to be inserted into a database table consisting of 33 columns and 94 rows, with each column having a size of 69 characters. The purpose of this script is to simulate the database exception that can occur due to the simultaneous insertion of such a large amount of data.\n", + "desc": "In a life scenario involving a bank, there is a database named 'BankDB' that stores customer account information and transaction records. One of the key tables in this database is 'TransactionRecords', which contains detailed information about various banking transactions. This table consists of 375 rows of data, with each row representing a transaction record. There are 55 columns in this table, each containing information of up to 99 characters. These columns may include transaction ID, account number, transaction type, transaction amount, transaction date and time, counterparty account information, transaction status, employee ID, transaction location, currency type, and more.During a particular period, 125 users simultaneously attempt frequent update operations on the 'TransactionRecords' table. These operations could involve modifying transaction statuses, updating transaction amounts, or adding transaction notes. However, due to the high level of concurrency and the locking mechanism in the database, these concurrent update operations may result in contention for locking the database table. This contention can cause locking delays, leading to slower processing of transaction requests and potential failures in completing transactions. 
The duration of this locking contention is not specified in the command.\n" + }, + "90": { + "start_time": "1697309955", + "end_time": "1697310015", + "start_timestamp": "2023-10-15 02:59:15", + "end_timestamp": "2023-10-15 03:00:15", + "alerts": [], + "labels": [ + "highly concurrent updates" + ], + "command": "python anomaly_trigger/main.py --anomaly LOCK_CONTENTION", + "script": "import psycopg2\nimport sys\nsys.path.append('/root/DB-GPT/')\nimport time\nimport datetime\nimport random\nimport yaml\nfrom multiprocessing.pool import *\n\n\nclass DBArgs(object):\n\n def __init__(self, dbtype, config, dbname=None):\n self.dbtype = dbtype\n if self.dbtype == 'mysql':\n self.host = config['host']\n self.port = config['port']\n self.user = config['user']\n self.password = config['password']\n self.dbname = dbname if dbname else config['dbname']\n self.driver = 'com.mysql.jdbc.Driver'\n self.jdbc = 'jdbc:mysql://'\n else:\n self.host = config['host']\n self.port = config['port']\n self.user = config['user']\n self.password = config['password']\n self.dbname = dbname if dbname else config['dbname']\n self.driver = 'org.postgresql.Driver'\n self.jdbc = 'jdbc:postgresql://'\n\nclass Database():\n def __init__(self, args, timeout=-1):\n self.args = args\n self.conn = self.resetConn(timeout)\n\n\n # self.schema = self.compute_table_schema()\n\n def resetConn(self, timeout=-1):\n conn = psycopg2.connect(database=self.args.dbname,\n user=self.args.user,\n password=self.args.password,\n host=self.args.host,\n port=self.args.port)\n return conn\n \n def execute_sqls(self,sql):\n self.conn =self.resetConn(timeout=-1)\n cur = self.conn.cursor()\n cur.execute(sql)\n self.conn.commit()\n cur.close()\n self.conn.close()\n\n def execute_sql_duration(self, duration, sql, max_id=0, commit_interval=500):\n self.conn = self.resetConn(timeout=-1)\n cursor = self.conn.cursor()\n start = time.time()\n cnt = 0\n if duration > 0:\n while (time.time() - start) < duration:\n if max_id > 0:\n id = random.randint(1, max_id - 1)\n cursor.execute(sql + str(id) + ';')\n else:\n cursor.execute(sql)\n cnt += 1\n if cnt % commit_interval == 0:\n self.conn.commit()\n else:\n print(\"error, the duration should be larger than 0\")\n self.conn.commit()\n cursor.close()\n self.conn.close()\n return cnt\n\n def concurrent_execute_sql(self, threads, duration, sql, max_id=0, commit_interval=500):\n pool = ThreadPool(threads)\n results = [pool.apply_async(self.execute_sql_duration, (duration, sql, max_id, commit_interval)) for _ in range(threads)]\n pool.close()\n pool.join()\n return results\n\ndef init():\n #add the config\n config_path = \"/root/DB-GPT/config/tool_config.yaml\"\n with open(config_path, 'r') as config_file:\n config = yaml.safe_load(config_file) \n db_args =DBArgs('pgsql', config)\n return db_args\n\n\n#create a table\ndef create_table(table_name,colsize, ncolumns):\n db=Database(init())\n column_definitions = ', '.join(f'name{i} varchar({colsize})' for i in range(ncolumns))\n creat_sql = f'CREATE TABLE {table_name} (id int, {column_definitions}, time timestamp);'\n db.execute_sqls(creat_sql)\n\n#delete the table\ndef delete_table(table_name):\n db=Database(init())\n delete_sql=f'DROP TABLE if exists {table_name}'\n db.execute_sqls(delete_sql)\n\n#print the current time\ndef print_time():\n current_time = datetime.datetime.now()\n formatted_time = current_time.strftime(\"%Y-%m-%d %H:%M:%S\")\n print(formatted_time)\n\n\ndef lock_contention(threads,duration,ncolumns,nrows,colsize,table_name='table1'):\n #create a new 
table\n print_time()\n delete_table(table_name)\n create_table(table_name,colsize, ncolumns)\n db=Database(init())\n # insert some data to be updated \n insert_definitions = ', '.join(f'(SELECT substr(md5(random()::text), 1, {colsize}))' for i in range(ncolumns))\n insert_data=f'insert into {table_name} select generate_series(1,{nrows}),{insert_definitions}, now();' \n db.execute_sqls(insert_data) \n pool = Pool(threads)\n for _ in range(threads):\n pool.apply_async(\n lock, (table_name, ncolumns, colsize, duration, nrows))\n pool.close()\n pool.join()\n #delete the table\n delete_table(table_name)\n print_time()\n\ndef lock(table_name, ncolumns, colsize, duration, nrows):\n args=init()\n start = time.time()\n #lock_contention\n while time.time()-start < duration:\n conn = psycopg2.connect(database=args.dbname, user=args.user, password=args.password,\n host=args.host, port=args.port)\n cur = conn.cursor()\n while time.time()-start < duration:\n col_name = random.randint(0, ncolumns-1)\n row_name = random.randint(1, nrows-1)\n lock_contention = f'update {table_name} set name{col_name}=(SELECT substr(md5(random()::text), 1, {colsize})) where id ={row_name}'\n #db.concurrent_execute_sql(threads,duration,lock_contention,nrows)\n cur.execute(lock_contention)\n conn.commit()\n conn.commit()\n conn.close()\n\nif __name__ == \"__main__\":\n # Number of threads to use for concurrent inserts\n num_threads = 125\n \n # Duration for which to run the inserts (in seconds)\n insert_duration = None\n \n # Number of columns in the table\n num_columns = 55\n \n # Number of rows to insert\n num_rows = 375\n \n # Size of each column (in characters)\n column_size = 99\n \n # Table name\n table_name = 'table1'\n \n # Call the insert_large_data function\n lock_contention(num_threads, insert_duration, num_columns, num_rows, column_size, table_name)\n", + "description": "In a database used by an online store, 125 users attempt to perform frequent update operations simultaneously. The database table contains 55 columns and 375 rows of product records, with each column having a size of 99 characters. These users compete with each other to lock the database table for performing update operations. Simulate the database exception caused by this process.\n", + "desc": "In an e-commerce scenario, there is an online marketplace database called 'EcommerceMarketplaceDB' that stores information about various products available for sale. Within the database, there is a table named 'ProductDetails' which contains detailed information about each product. This table has 2,075,265 rows of data, with each row representing a unique product entry. The table contains 5 columns, each with a size of 69 characters. These columns include product ID, name, price, stock quantity, and category. An anomaly is triggered when the database administrator needs to perform a VACUUM operation. This VACUUM operation is necessary to reclaim storage space and optimize the performance of the database. However, due to the large size of the 'ProductDetails' table and the high number of concurrent transactions, performing the VACUUM operation with 144 threads can cause performance issues and potential anomalies in the database. 
Without proper optimization measures, such as batch processing or scheduling the operation during low traffic periods, this VACUUM operation could lead to delays or failures in other database operations and impact the overall efficiency of the e-commerce platform.\n" + }, + "91": { + "start_time": "1697310075", + "end_time": "1697310166", + "start_timestamp": "2023-10-15 03:01:15", + "end_timestamp": "2023-10-15 03:02:46", + "alerts": [], + "labels": [ + "highly deletes" + ], + "command": "python anomaly_trigger/main.py --anomaly VACUUM", + "script": "import psycopg2\nimport sys\nsys.path.append('/root/DB-GPT/')\nimport time\nimport datetime\nimport random\nimport yaml\nfrom multiprocessing.pool import *\n\n\nclass DBArgs(object):\n\n def __init__(self, dbtype, config, dbname=None):\n self.dbtype = dbtype\n if self.dbtype == 'mysql':\n self.host = config['host']\n self.port = config['port']\n self.user = config['user']\n self.password = config['password']\n self.dbname = dbname if dbname else config['dbname']\n self.driver = 'com.mysql.jdbc.Driver'\n self.jdbc = 'jdbc:mysql://'\n else:\n self.host = config['host']\n self.port = config['port']\n self.user = config['user']\n self.password = config['password']\n self.dbname = dbname if dbname else config['dbname']\n self.driver = 'org.postgresql.Driver'\n self.jdbc = 'jdbc:postgresql://'\n\nclass Database():\n def __init__(self, args, timeout=-1):\n self.args = args\n self.conn = self.resetConn(timeout)\n\n\n # self.schema = self.compute_table_schema()\n\n def resetConn(self, timeout=-1):\n conn = psycopg2.connect(database=self.args.dbname,\n user=self.args.user,\n password=self.args.password,\n host=self.args.host,\n port=self.args.port)\n return conn\n \n def execute_sqls(self,sql):\n self.conn =self.resetConn(timeout=-1)\n cur = self.conn.cursor()\n cur.execute(sql)\n self.conn.commit()\n cur.close()\n self.conn.close()\n\n def execute_sql_duration(self, duration, sql, max_id=0, commit_interval=500):\n self.conn = self.resetConn(timeout=-1)\n cursor = self.conn.cursor()\n start = time.time()\n cnt = 0\n if duration > 0:\n while (time.time() - start) < duration:\n if max_id > 0:\n id = random.randint(1, max_id - 1)\n cursor.execute(sql + str(id) + ';')\n else:\n cursor.execute(sql)\n cnt += 1\n if cnt % commit_interval == 0:\n self.conn.commit()\n else:\n print(\"error, the duration should be larger than 0\")\n self.conn.commit()\n cursor.close()\n self.conn.close()\n return cnt\n\n def concurrent_execute_sql(self, threads, duration, sql, max_id=0, commit_interval=500):\n pool = ThreadPool(threads)\n results = [pool.apply_async(self.execute_sql_duration, (duration, sql, max_id, commit_interval)) for _ in range(threads)]\n pool.close()\n pool.join()\n return results\n \ndef init():\n #add the config\n config_path = \"/root/DB-GPT/config/tool_config.yaml\"\n with open(config_path, 'r') as config_file:\n config = yaml.safe_load(config_file) \n db_args =DBArgs('pgsql', config)\n return db_args\n\n\n#create a table\ndef create_table(table_name,colsize, ncolumns):\n db=Database(init())\n column_definitions = ', '.join(f'name{i} varchar({colsize})' for i in range(ncolumns))\n creat_sql = f'CREATE TABLE {table_name} (id int, {column_definitions}, time timestamp);'\n db.execute_sqls(creat_sql)\n\n#delete the table\ndef delete_table(table_name):\n db=Database(init())\n delete_sql=f'DROP TABLE if exists {table_name}'\n db.execute_sqls(delete_sql)\n\n#print the current time\ndef print_time():\n current_time = datetime.datetime.now()\n formatted_time = 
current_time.strftime(\"%Y-%m-%d %H:%M:%S\")\n print(formatted_time)\n\n\ndef vacuum(threads,duration,ncolumns,nrows,colsize,table_name='table1'):\n db=Database(init())\n #create a new table\n print_time()\n delete_table(table_name)\n create_table(table_name,colsize, ncolumns)\n\n # insert some data to be deleted\n insert_definitions = ', '.join(f'(SELECT substr(md5(random()::text), 1, {colsize}))' for i in range(ncolumns))\n insert_data=f'insert into {table_name} select generate_series(1,{nrows}),{insert_definitions}, now();' \n db.execute_sqls(insert_data) \n\n # delete 80% of the rows\n delete_nrows=int(nrows*0.8)\n vacuum=f'delete from {table_name} where id < {delete_nrows};'\n db.execute_sqls(vacuum)\n\n # do the select , then the vacuum occurs\n select='select * from '+table_name+' where id='\n db.concurrent_execute_sql(threads,duration,select,nrows)\n\n #delete the table\n delete_table(table_name)\n print_time()\n\n\nif __name__ == \"__main__\":\n # Number of threads to use for concurrent inserts\n num_threads = 144\n \n # Duration for which to run the inserts (in seconds)\n insert_duration = None\n \n # Number of columns in the table\n num_columns = 5\n \n # Number of rows to insert\n num_rows = 2075265\n \n # Size of each column (in characters)\n column_size = 69\n \n # Table name\n table_name = 'table1'\n \n # Call the insert_large_data function\n vacuum(num_threads, insert_duration, num_columns, num_rows, column_size, table_name)\n", + "description": "In the database of an online store, if there are 144 users simultaneously performing a search operation after a large-scale data cleaning operation on a database table containing 5 columns, 2,075,265 rows, each column size of 69 characters of product records, an exception is caused due to the competition for database resources.\n", + "desc": "In a business intelligence scenario, particularly in the context of financial data analysis for large companies, there is a database called 'CorporateFinanceDB'. This database is dedicated to storing and processing financial information for different corporations. It contains various tables, one of which is the 'FinancialRecords' table. This table maintains records of financial transactions and statement information, and it consists of a total of 577,945 rows, each representing a financial record. The table has 92 columns, each capable of storing up to 84 characters of data. These columns encompass information such as transaction ID, transaction type (income, expenditure, assets, liabilities), transaction amount, date, department, project code, budget code, financial year, and audit status. In the context of this dataset, executing an a large number of redundant index creation statements may occur. This can be caused by the need to expedite queries related to financial analysis. Such index creation may be performed to accelerate various complex queries, including departmental budget analysis, quarterly income reports, or annual audits. However, these redundant indexes may lead to additional storage usage and performance overhead within the database. The process of creating and deleting indexes frequently can result in database fragmentation and affect overall performance. 
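
A condensed sketch of the VACUUM trigger encoded above, with placeholder connection settings and scaled-down sizes: bulk-load a table, delete 80% of the rows so autovacuum has many dead tuples to reclaim, then issue concurrent point selects while that happens.

    import random
    import time
    from multiprocessing.pool import ThreadPool

    import psycopg2

    # Placeholder connection settings.
    DSN = "dbname=testdb user=postgres password=postgres host=127.0.0.1 port=5432"

    def run_sql(sql):
        conn = psycopg2.connect(DSN)
        with conn, conn.cursor() as cur:   # commits on successful exit
            cur.execute(sql)
        conn.close()

    def point_selects(table, nrows, duration):
        conn = psycopg2.connect(DSN)
        cur = conn.cursor()
        start = time.time()
        while time.time() - start < duration:
            cur.execute(f"SELECT * FROM {table} WHERE id = {random.randint(1, nrows - 1)}")
            cur.fetchall()
        conn.close()

    if __name__ == "__main__":
        # Sizes are scaled down from entry "91" (5 columns x 2,075,265 rows, 144 threads).
        table, ncols, nrows, colsize, threads, duration = "table1", 5, 100_000, 69, 16, 60
        cols = ", ".join(f"name{i} varchar({colsize})" for i in range(ncols))
        vals = ", ".join(f"substr(md5(random()::text), 1, {colsize})" for _ in range(ncols))
        run_sql(f"DROP TABLE IF EXISTS {table}")
        run_sql(f"CREATE TABLE {table} (id int, {cols}, time timestamp)")
        run_sql(f"INSERT INTO {table} SELECT generate_series(1, {nrows}), {vals}, now()")
        run_sql(f"DELETE FROM {table} WHERE id < {int(nrows * 0.8)}")  # leave most rows as dead tuples
        pool = ThreadPool(threads)
        for _ in range(threads):
            pool.apply_async(point_selects, (table, nrows, duration))
        pool.close()
        pool.join()
        run_sql(f"DROP TABLE IF EXISTS {table}")
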
Consequently, generating reports based on these financial records could be delayed, thus impacting the decision-making efficiency in a business intelligence environment.\n" + }, + "92": { + "start_time": "1697310226", + "end_time": "1697310341", + "start_timestamp": "2023-10-15 03:03:46", + "end_timestamp": "2023-10-15 03:05:41", + "alerts": [], + "labels": [ + "too many indexes" + ], + "command": "python anomaly_trigger/main.py --anomaly REDUNDANT_INDEX", + "script": "import psycopg2\nimport sys\nsys.path.append('/root/DB-GPT/')\nimport time\nimport datetime\nimport random\nimport yaml\nfrom multiprocessing.pool import *\n\n\nclass DBArgs(object):\n\n def __init__(self, dbtype, config, dbname=None):\n self.dbtype = dbtype\n if self.dbtype == 'mysql':\n self.host = config['host']\n self.port = config['port']\n self.user = config['user']\n self.password = config['password']\n self.dbname = dbname if dbname else config['dbname']\n self.driver = 'com.mysql.jdbc.Driver'\n self.jdbc = 'jdbc:mysql://'\n else:\n self.host = config['host']\n self.port = config['port']\n self.user = config['user']\n self.password = config['password']\n self.dbname = dbname if dbname else config['dbname']\n self.driver = 'org.postgresql.Driver'\n self.jdbc = 'jdbc:postgresql://'\n\nclass Database():\n def __init__(self, args, timeout=-1):\n self.args = args\n self.conn = self.resetConn(timeout)\n\n\n # self.schema = self.compute_table_schema()\n\n def resetConn(self, timeout=-1):\n conn = psycopg2.connect(database=self.args.dbname,\n user=self.args.user,\n password=self.args.password,\n host=self.args.host,\n port=self.args.port)\n return conn\n \n def execute_sqls(self,sql):\n self.conn =self.resetConn(timeout=-1)\n cur = self.conn.cursor()\n cur.execute(sql)\n self.conn.commit()\n cur.close()\n self.conn.close()\n\n def execute_sql_duration(self, duration, sql, max_id=0, commit_interval=500):\n self.conn = self.resetConn(timeout=-1)\n cursor = self.conn.cursor()\n start = time.time()\n cnt = 0\n if duration > 0:\n while (time.time() - start) < duration:\n if max_id > 0:\n id = random.randint(1, max_id - 1)\n cursor.execute(sql + str(id) + ';')\n else:\n cursor.execute(sql)\n cnt += 1\n if cnt % commit_interval == 0:\n self.conn.commit()\n else:\n print(\"error, the duration should be larger than 0\")\n self.conn.commit()\n cursor.close()\n self.conn.close()\n return cnt\n\n def concurrent_execute_sql(self, threads, duration, sql, max_id=0, commit_interval=500):\n pool = ThreadPool(threads)\n results = [pool.apply_async(self.execute_sql_duration, (duration, sql, max_id, commit_interval)) for _ in range(threads)]\n pool.close()\n pool.join()\n return results\n \n def build_index(self, table_name, idx_num):\n self.conn = self.resetConn(timeout=-1)\n cursor = self.conn.cursor()\n \n for i in range(0, idx_num):\n the_sql = 'CREATE INDEX index_' + table_name + '_' + str(i) + ' ON ' + table_name + '(name' + str(i) + ');'\n print(the_sql)\n cursor.execute(the_sql)\n\n \n self.conn.commit()\n self.conn.close()\n return\n\n\n \n def drop_index(self,table_name):\n self.conn = self.resetConn(timeout=-1)\n cursor = self.conn.cursor()\n cursor.execute(\"select indexname from pg_indexes where tablename='\"+table_name+\"';\")\n idxs = cursor.fetchall()\n for idx in idxs:\n the_sql = 'DROP INDEX ' + idx[0] + ';'\n cursor.execute(the_sql)\n print(the_sql)\n self.conn.commit()\n self.conn.close()\n return\n\n\ndef init():\n #add the config\n config_path = \"/root/DB-GPT/config/tool_config.yaml\"\n with open(config_path, 'r') as 
config_file:\n config = yaml.safe_load(config_file) \n db_args =DBArgs('pgsql', config)\n return db_args\n\n\n#create a table\ndef create_table(table_name,colsize, ncolumns):\n db=Database(init())\n column_definitions = ', '.join(f'name{i} varchar({colsize})' for i in range(ncolumns))\n creat_sql = f'CREATE TABLE {table_name} (id int, {column_definitions}, time timestamp);'\n db.execute_sqls(creat_sql)\n\n#delete the table\ndef delete_table(table_name):\n db=Database(init())\n delete_sql=f'DROP TABLE if exists {table_name}'\n db.execute_sqls(delete_sql)\n\n#print the current time\ndef print_time():\n current_time = datetime.datetime.now()\n formatted_time = current_time.strftime(\"%Y-%m-%d %H:%M:%S\")\n print(formatted_time)\n\ndef redundent_index(threads,duration,ncolumns,nrows,colsize,nindex,table_name='table1'):\n #create a new table\n print_time()\n delete_table(table_name)\n create_table(table_name,colsize, ncolumns)\n db=Database(init())\n # insert some data to be updated \n insert_definitions = ', '.join(f'(SELECT substr(md5(random()::text), 1, {colsize}))' for i in range(ncolumns))\n insert_data=f'insert into {table_name} select generate_series(1,{nrows}),{insert_definitions}, now();' \n db.execute_sqls(insert_data) \n\n #initialization of the indexes\n nindex=int((nindex*ncolumns)/10)\n db.build_index(table_name,nindex)\n id_index='CREATE INDEX index_'+table_name+'_id ON '+table_name+'(id);'\n db.execute_sqls(id_index)\n\n #lock_contention\n pool = Pool(threads)\n for _ in range(threads):\n pool.apply_async(\n lock, (table_name, ncolumns, colsize, duration, nrows))\n pool.close()\n pool.join()\n\n #drop the index\n db.drop_index(table_name)\n\n #delete the table\n delete_table(table_name)\n print_time()\n\n\ndef lock(table_name, ncolumns, colsize, duration, nrows):\n args=init()\n start = time.time()\n #lock_contention\n while time.time()-start < duration:\n conn = psycopg2.connect(database=args.dbname, user=args.user, password=args.password,\n host=args.host, port=args.port)\n cur = conn.cursor()\n while time.time()-start < duration:\n col_name = random.randint(0, ncolumns-1)\n row_name = random.randint(1, nrows-1)\n lock_contention = f'update {table_name} set name{col_name}=(SELECT substr(md5(random()::text), 1, {colsize})) where id ={row_name}'\n #db.concurrent_execute_sql(threads,duration,lock_contention,nrows)\n cur.execute(lock_contention)\n conn.commit()\n conn.commit()\n conn.close()\n\nif __name__ == \"__main__\":\n # Number of threads to use for concurrent inserts\n num_threads = 7\n \n # Duration for which to run the inserts (in seconds)\n insert_duration = None\n \n # Number of columns in the table\n num_columns = 92\n \n # Number of rows to insert\n num_rows = 577945\n \n # Size of each column (in characters)\n column_size = 84\n \n # Table name\n table_name = 'table1'\n \n nindex=6\n \n # Call the insert_large_data function\n redundent_index(num_threads, insert_duration, num_columns, num_rows, column_size, nindex,table_name)\n", + "description": "In an online marketplace's database, there are 7 users searching for products in a table containing 92 columns and 577,945 rows. Each column has a size of 84 characters. However, there are redundant indexes created at the beginning of the search operation, which may lead to additional storage usage and performance overhead.\n", + "desc": "In a file sharing system scenario, we can visualize a database named 'TeamFileShareDB', which is used by teams or organizations for sharing files. 
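
A condensed sketch of the REDUNDANT_INDEX trigger encoded above: build one index per text column before a write-heavy workload, then drop them all afterwards. Connection settings are placeholders, the index count is scaled down, and the table is assumed to already exist with columns name0, name1, and so on.

    import psycopg2

    # Placeholder connection settings.
    DSN = "dbname=testdb user=postgres password=postgres host=127.0.0.1 port=5432"

    def build_redundant_indexes(table, idx_num):
        conn = psycopg2.connect(DSN)
        cur = conn.cursor()
        for i in range(idx_num):
            # Each name<i> column gets its own index even though the workload filters on id.
            cur.execute(f"CREATE INDEX index_{table}_{i} ON {table} (name{i})")
        conn.commit()
        conn.close()

    def drop_all_indexes(table):
        conn = psycopg2.connect(DSN)
        cur = conn.cursor()
        cur.execute("SELECT indexname FROM pg_indexes WHERE tablename = %s", (table,))
        for (idx,) in cur.fetchall():
            cur.execute(f"DROP INDEX {idx}")
        conn.commit()
        conn.close()

    if __name__ == "__main__":
        build_redundant_indexes("table1", 12)   # assumes table1 with name0..name11 exists
        # ... run the concurrent update workload from the lock-contention sketch here ...
        drop_all_indexes("table1")
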
This database not only stores the files themselves but also records the metadata of the files, such as uploader information, file size, creation and modification dates, version history, access permissions, and download counts. On a typical workday, multiple users might be simultaneously uploading, downloading, or editing files. For example, a project team is collaborating to complete an important report, with team members frequently uploading the latest versions of files, while others download them for viewing or editing. Additionally, the system might be used for storing and sharing large files, such as presentations, video conference recordings, or design drawings. Due to such high concurrency in file operations, the database 'TeamFileShareDB' faces challenges of I/O (input/output) contention. When multiple users simultaneously upload or download large files, the system's storage and network bandwidth might be significantly strained. This I/O contention can lead to slower file transfer speeds, especially in situations of limited bandwidth or insufficient server processing capabilities. Furthermore, frequent write operations in the database (such as file uploads and metadata updates) can impact database performance. During peak periods, the database might encounter locking and transaction management issues, further slowing down file processing and metadata recording.\n" + }, + "93": { + "start_time": "1697310401", + "end_time": "1697310492", + "start_timestamp": "2023-10-15 03:06:41", + "end_timestamp": "2023-10-15 03:08:12", + "alerts": [], + "labels": [ + "INSERT_LARGE_DATA", + "IO_CONTENTION" + ], + "command": "python anomaly_trigger/main.py --anomaly INSERT_LARGE_DATA,IO_CONTENTION", + "script": "import os\nimport datetime\n\n#print the current time\ndef print_time():\n current_time = datetime.datetime.now()\n formatted_time = current_time.strftime(\"%Y-%m-%d %H:%M:%S\")\n print(formatted_time)\n\nif __name__ == \"__main__\":\n print_time()\n command = (\n \"su - root -c 'cd /sysbench-tpcc-master; \"\n \"./tpcc.lua --db-driver=pgsql --tables=2 --scale=3 --threads=50 --events=0 \"\n \"--pgsql-host=xxxx --pgsql-user=xxxx --pgsql-password=xxxx \"\n \"--pgsql-port=5432 --pgsql-db=tpcc --time=90 --rand-type=uniform --report-interval=10 run'\"\n )\n\n os.system(command)\n print_time()\n", + "description": "In a file sharing system where multiple users upload, download, or edit files simultaneously, there is a contention for input/output operations. This results in slower file transfers.\n", + "desc": "In an Internet of Things (IoT) scenario, there is a database named 'IoTDataDB' that collects and analyzes data from various IoT sensors. One of the key tables in this database is called 'SensorData', which stores data from different sensors. For example, it may include sensor ID, sensor type, sensor value, timestamp, location, and other relevant information. In this scenario, the system is experiencing poor join performance when performing queries that involve joining multiple tables, such as joining the 'SensorData' table with other related tables. This poor join performance could be due to factors such as inefficient query plans, lack of appropriate indexes or statistics, or mismatched data types between join columns. As a result, when executing queries that require joining tables, the database might experience significant delays or even timeouts, negatively impacting the overall system performance. 
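
The INSERT_LARGE_DATA,IO_CONTENTION entries do not build their own table; the escaped script above shells out to sysbench-tpcc. A minimal equivalent using subprocess is shown here, with placeholder host, credentials, and path (the dataset masks them as xxxx) and without the su - root wrapper used in the original command.

    import subprocess

    cmd = [
        "./tpcc.lua",
        "--db-driver=pgsql", "--tables=2", "--scale=3", "--threads=50", "--events=0",
        "--pgsql-host=127.0.0.1", "--pgsql-user=postgres", "--pgsql-password=postgres",
        "--pgsql-port=5432", "--pgsql-db=tpcc",
        "--time=90", "--rand-type=uniform", "--report-interval=10",
        "run",
    ]
    # Assumes sysbench and the sysbench-tpcc checkout are installed at this path.
    subprocess.run(cmd, cwd="/sysbench-tpcc-master", check=True)
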
Additionally, there might be CPU contention issues, where the CPU resources of the system are overwhelmed by the high processing demands of the queries. This contention can lead to increased query execution times and resource bottlenecks, ultimately affecting the real-time processing capabilities of the IoT data.\n" + }, + "94": { + "start_time": "1697310552", + "end_time": "1697310612", + "start_timestamp": "2023-10-15 03:09:12", + "end_timestamp": "2023-10-15 03:10:12", + "alerts": [], + "labels": [ + "POOR JOIN PERFORMANCE", + "CPU CONTENTION" + ], + "command": "python anomaly_trigger/main.py --anomaly POOR_JOIN_PERFORMANCE,CPU_CONTENTION", + "script": "import os\nimport re\nimport time\n\nimport psycopg2\n\n\nREPEATCOUNT = 1\nTIMELOGPATH = str(int(time.time())) + \"_job_trigger_time_log.txt\"\nTIMELOG = open(TIMELOGPATH, 'w+')\n\n\nclass Database():\n\n def __init__(self):\n self.conn = None\n self.conn = psycopg2.connect(database='imdbload',\n user='xxxx',\n password='xxxx',\n host='xxxx',\n port=5432)\n\n def execute_sql(self, sql):\n fail = 1\n cur = self.conn.cursor()\n i = 0\n cnt = 3\n while fail == 1 and i < cnt:\n try:\n fail = 0\n cur.execute(sql)\n except BaseException as error:\n fail = 1\n print(error)\n res = []\n if fail == 0:\n res = cur.fetchall()\n i = i + 1\n if fail == 1:\n # print(\"SQL Execution Fatal!!\", sql)\n return 0, ''\n elif fail == 0:\n return 1, res\n\n\ndef all_sql_files():\n res_path = \"{}/join-order-benchmark-master/\".format(\n os.path.dirname(os.path.abspath(__file__)))\n # all_file_list = list(filter(file_filter, os.listdir(res_path)))\n # all_file_list = sorted(all_file_list, key=custom_sort)\n all_file_list = [\n '1a.sql', '1b.sql', '1c.sql', '1d.sql',\n '2a.sql', '2b.sql', '2c.sql', '2d.sql',\n '3a.sql', '3b.sql', '3c.sql',\n '4a.sql', '4b.sql', '4c.sql',\n '5a.sql', '5b.sql', '5c.sql',\n '6a.sql', '6b.sql', '6c.sql', '6d.sql', '6e.sql', '6f.sql',\n '7a.sql', '7b.sql', '7c.sql',\n '8a.sql', '8b.sql', '8c.sql', '8d.sql',\n '9a.sql', '9b.sql', '9c.sql', '9d.sql',\n '10a.sql', '10b.sql', '10c.sql',\n '11a.sql', '11b.sql', '11c.sql', '11d.sql',\n '12a.sql', '12b.sql', '12c.sql',\n '13a.sql', '13b.sql', '13c.sql', '13d.sql',\n '14a.sql', '14b.sql', '14c.sql',\n '15a.sql', '15b.sql', '15c.sql', '15d.sql',\n '16a.sql', '16b.sql', '16c.sql', '16d.sql',\n '17a.sql', '17b.sql', '17c.sql', '17d.sql', '17e.sql', '17f.sql',\n '18a.sql', '18b.sql', '18c.sql',\n '19a.sql', '19b.sql', '19c.sql', '19d.sql',\n '20a.sql', '20b.sql', '20c.sql',\n '21a.sql', '21b.sql', '21c.sql',\n '22a.sql', '22b.sql', '22c.sql', '22d.sql',\n '23a.sql', '23b.sql', '23c.sql',\n '24a.sql', '24b.sql',\n '25a.sql', '25b.sql', '25c.sql',\n '26a.sql', '26b.sql', '26c.sql',\n '27a.sql', '27b.sql', '27c.sql',\n '28a.sql', '28b.sql', '28c.sql',\n '29a.sql', '29b.sql', '29c.sql',\n '30a.sql', '30b.sql', '30c.sql',\n '31a.sql', '31b.sql', '31c.sql',\n '32a.sql', '32b.sql',\n '33a.sql', '33b.sql', '33c.sql']\n\n print(all_file_list)\n files_list = []\n for file in all_file_list:\n files_list.append(res_path + file)\n return files_list\n\n\ndef custom_sort(item):\n # \u63d0\u53d6\u6570\u5b57\u548c\u5b57\u6bcd\u90e8\u5206\n match = re.match(r'(\\d+)(\\D+)', item)\n # \u5c06\u6570\u5b57\u90e8\u5206\u8f6c\u6362\u4e3a\u6574\u6570\u4ee5\u8fdb\u884c\u6bd4\u8f83\n num_part = int(match.group(1))\n # \u8fd4\u56de\u5143\u7ec4\u4ee5\u6309\u6570\u5b57\u548c\u5b57\u6bcd\u6392\u5e8f\n return (num_part, match.group(2))\n\n\ndef file_filter(f):\n if f[-4:] == '.sql' and 'schema' not in f and 'fkindexes' not in 
f:\n return True\n else:\n return False\n\n\ndef get_sql_from_file(file_name):\n file = open(file_name)\n lines = file.readlines().copy()\n sql = ''\n for line in lines:\n sql += line\n sql = sql.replace('\n', ' ').replace(' ', ' ').replace(' ', ' ')\n file.close()\n return sql\n\n\ndef test_hint_from_file(sql_file):\n db = Database()\n sql = get_sql_from_file(sql_file)\n success, result_cont = db.execute_sql(sql)\n print(success, result_cont)\n\n\ndef test_all():\n sql_files = all_sql_files()\n\n for sql_file in list(sql_files)[:-10]:\n if sql_file:\n test_hint_from_file(sql_file)\n\n\ndef test_one():\n res_path = \"{}/join-order-benchmark-master/\".format(\n os.path.dirname(os.path.abspath(__file__)))\n test_hint_from_file(res_path + '1a.sql')\n\n\nif __name__ == '__main__':\n for i in range(0, REPEATCOUNT):\n TIMELOG.write(str(int(time.time()))+\";\")\n test_all()\n TIMELOG.write(str(int(time.time()))+\"\n\")\n TIMELOG.flush()\n\n TIMELOG.close()\n\n)\n", + "description": "In an enterprise resource planning (ERP) system, multiple users are simultaneously performing join operations on a database table containing a large amount of data. However, the join operation is not optimized, leading to poor performance. Additionally, there is contention for CPU resources among the users, further impacting the performance of the join operation.\n", + "desc": "In the database of an e-commerce platform, suppose there is a database named 'ECommerceDB', which includes a crucial table named 'ProductInventory' for recording the inventory information of various products. This table might contain inventory data for tens of thousands or even hundreds of thousands of products. The inventory information for each product includes the product ID, current stock level, last inventory update time, supplier ID, warehouse location, and other details. In this database, querying the inventory level of each product may require performing related subqueries. For example, a common query might be to determine the total current inventory of all products within a specific category. This type of query might first involve selecting all products of a particular category from the 'ProductDetails' table, then conducting subqueries on the 'ProductInventory' table to obtain the inventory data for these products. When the number of products is very large, the performance of these related subqueries can become inefficient. For instance, if a category includes thousands of products, executing individual subqueries for each product to obtain inventory information would be time-consuming. 
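
A condensed sketch of the POOR_JOIN_PERFORMANCE,CPU_CONTENTION trigger above: replay the join-order-benchmark (JOB) queries against the imdbload database, skipping the schema and fkindexes files. Connection settings and the benchmark directory are placeholders, and the retry wrapper of the dataset script is omitted.

    import glob
    import os

    import psycopg2

    # Placeholder paths and connection settings.
    JOB_DIR = os.path.join(os.path.dirname(os.path.abspath(__file__)), "join-order-benchmark-master")
    DSN = "dbname=imdbload user=postgres password=postgres host=127.0.0.1 port=5432"

    def run_job_queries():
        conn = psycopg2.connect(DSN)
        cur = conn.cursor()
        for path in sorted(glob.glob(os.path.join(JOB_DIR, "*.sql"))):
            name = os.path.basename(path)
            if "schema" in name or "fkindexes" in name:
                continue   # keep only the join queries, as the dataset's file filter does
            with open(path) as f:
                sql = f.read()
            cur.execute(sql)
            cur.fetchall()
            print("finished", name)
        conn.close()

    if __name__ == "__main__":
        run_job_queries()
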
In such cases, due to the need to retrieve inventory information for a large number of products, the database might need to read a significant amount of data from the disk, which could lead to I/O bottlenecks.\n" + }, + "95": { + "start_time": "1697310672", + "end_time": "1697310822", + "start_timestamp": "2023-10-15 03:11:12", + "end_timestamp": "2023-10-15 03:13:42", + "alerts": [], + "labels": [ + "FETCH_LARGE_DATA", + "CORRELATED SUBQUERY" + ], + "command": "python anomaly_trigger/main.py --anomaly FETCH_LARGE_DATA,CORRELATED_SUBQUERY", + "script": "import os\nimport re\nimport time\n\nimport psycopg2\n\n\nREPEATCOUNT = 1\nTIMELOGPATH = str(int(time.time())) + \"_tpch_trigger_time_log.txt\"\nTIMELOG = open(TIMELOGPATH, 'w+')\n\n\nclass Database():\n\n def __init__(self):\n self.conn = None\n self.conn = psycopg2.connect(database='tpch',\n user='xxxx',\n password='xxxx',\n host='xxxx',\n port=5432)\n\n def execute_sql(self, sql):\n fail = 1\n cur = self.conn.cursor()\n i = 0\n cnt = 3\n while fail == 1 and i < cnt:\n try:\n fail = 0\n cur.execute(sql)\n except BaseException as error:\n fail = 1\n print(error)\n res = []\n if fail == 0:\n res = cur.fetchall()\n i = i + 1\n if fail == 1:\n # print(\"SQL Execution Fatal!!\", sql)\n return 0, ''\n elif fail == 0:\n return 1, res\n\n\ndef all_sql_files():\n res_path = \"{}/tpch-queries/\".format(\n os.path.dirname(os.path.abspath(__file__)))\n # all_file_list = list(filter(file_filter, os.listdir(res_path)))\n # all_file_list = sorted(all_file_list, key=custom_sort)\n all_file_list = [\n '4.explain.sql']\n\n print(all_file_list)\n files_list = []\n for file in all_file_list:\n files_list.append(res_path + file)\n return files_list\n\n\ndef custom_sort(item):\n # \u63d0\u53d6\u6570\u5b57\u548c\u5b57\u6bcd\u90e8\u5206\n match = re.match(r'(\\d+)(\\D+)', item)\n # \u5c06\u6570\u5b57\u90e8\u5206\u8f6c\u6362\u4e3a\u6574\u6570\u4ee5\u8fdb\u884c\u6bd4\u8f83\n num_part = int(match.group(1))\n # \u8fd4\u56de\u5143\u7ec4\u4ee5\u6309\u6570\u5b57\u548c\u5b57\u6bcd\u6392\u5e8f\n return (num_part, match.group(2))\n\n\ndef file_filter(f):\n if f[-4:] == '.sql' and 'schema' not in f and 'fkindexes' not in f:\n return True\n else:\n return False\n\n\ndef get_sql_from_file(file_name):\n file = open(file_name)\n lines = file.readlines().copy()\n sql = ''\n for line in lines:\n sql += line\n sql = sql.replace('\n', ' ').replace(' ', ' ').replace(' ', ' ')\n file.close()\n return sql\n\n\ndef test_hint_from_file(sql_file):\n db = Database()\n sql = get_sql_from_file(sql_file)\n success, result_cont = db.execute_sql(sql)\n print(success, result_cont)\n\n\ndef test_all():\n sql_files = all_sql_files()\n\n for sql_file in list(sql_files):\n if sql_file:\n test_hint_from_file(sql_file)\n\n\ndef test_one():\n res_path = \"{}/tpch-queries/\".format(\n os.path.dirname(os.path.abspath(__file__)))\n test_hint_from_file(res_path + '1.explain.sql')\n\n\nif __name__ == '__main__':\n for i in range(0, REPEATCOUNT):\n TIMELOG.write(str(int(time.time()))+\";\")\n test_all()\n TIMELOG.write(str(int(time.time()))+\"\n\")\n TIMELOG.flush()\n\n TIMELOG.close()\n", + "description": "In a real-life scenario, this statement might be used in an e-commerce platform where the inventory levels of each product need to be determined. This requires executing subqueries that are related to each product. 
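
Illustrative only: the kind of correlated subquery the FETCH_LARGE_DATA,CORRELATED_SUBQUERY description refers to, written against the hypothetical ProductDetails and ProductInventory tables from the scenario text (the dataset script itself replays a TPC-H query instead). Table and column names here are assumptions.

    import psycopg2

    # Placeholder connection settings; schema names are illustrative, not from the dataset.
    DSN = "dbname=ecommercedb user=postgres password=postgres host=127.0.0.1 port=5432"

    CORRELATED_SQL = """
    SELECT p.product_id,
           (SELECT SUM(i.stock_level)            -- re-evaluated once per product row
              FROM ProductInventory AS i
             WHERE i.product_id = p.product_id) AS total_stock
      FROM ProductDetails AS p
     WHERE p.category = %s;
    """

    def total_stock_per_product(category):
        conn = psycopg2.connect(DSN)
        cur = conn.cursor()
        cur.execute(CORRELATED_SQL, (category,))
        rows = cur.fetchall()
        conn.close()
        return rows
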
If these subqueries are not optimized, querying the inventory for a large number of products may result in poor performance.\n", + "desc": "In the Internet of Things (IoT) scenario, imagine a database named 'IoTDataDB' that is specifically designed to collect and analyze sensor data. This database is used to store data from various types of sensors, such as temperature, humidity, light, motion, and more. It contains a primary table called 'SensorData', which stores information about multiple sensors. Each row in this table represents a data entry from a specific sensor, and there are a total of 87 rows. The 'SensorData' table consists of 7 columns, each column capable of storing up to 24 characters. These columns may include sensor ID, sensor type, reading value, timestamp, location, status, and more.In this scenario, the database experiences performance issues when there is a large influx of data from the sensors. When all 72 sensors transmit data simultaneously, the database's ability to handle such a high volume of write requests is limited. This could be due to inefficient data partitioning, lack of buffering mechanisms, or insufficient indexing in the 'SensorData' table. As a result, the database might struggle to efficiently process all the incoming data, leading to write latency and potential anomalies in the system. These anomalies could affect the real-time analysis of sensor data and impact the overall performance of the IoT application.\n" + }, + "96": { + "start_time": "1697310882", + "end_time": "1697310953", + "start_timestamp": "2023-10-15 03:14:42", + "end_timestamp": "2023-10-15 03:15:53", + "alerts": [], + "labels": [ + "highly concurrent commits or highly concurrent inserts" + ], + "command": "python anomaly_trigger/main.py --anomaly INSERT_LARGE_DATA", + "script": "import psycopg2\nimport sys\nsys.path.append('/root/DB-GPT/')\nimport time\nimport datetime\nimport random\nimport yaml\nfrom multiprocessing.pool import *\n\nclass DBArgs(object):\n\n def __init__(self, dbtype, config, dbname=None):\n self.dbtype = dbtype\n if self.dbtype == 'mysql':\n self.host = config['host']\n self.port = config['port']\n self.user = config['user']\n self.password = config['password']\n self.dbname = dbname if dbname else config['dbname']\n self.driver = 'com.mysql.jdbc.Driver'\n self.jdbc = 'jdbc:mysql://'\n else:\n self.host = config['host']\n self.port = config['port']\n self.user = config['user']\n self.password = config['password']\n self.dbname = dbname if dbname else config['dbname']\n self.driver = 'org.postgresql.Driver'\n self.jdbc = 'jdbc:postgresql://'\n\nclass Database():\n def __init__(self, args, timeout=-1):\n self.args = args\n self.conn = self.resetConn(timeout)\n\n\n # self.schema = self.compute_table_schema()\n\n def resetConn(self, timeout=-1):\n conn = psycopg2.connect(database=self.args.dbname,\n user=self.args.user,\n password=self.args.password,\n host=self.args.host,\n port=self.args.port)\n return conn\n\n def execute_sqls(self,sql):\n self.conn =self.resetConn(timeout=-1)\n cur = self.conn.cursor()\n cur.execute(sql)\n self.conn.commit()\n cur.close()\n self.conn.close()\n\n def execute_sql_duration(self, duration, sql, max_id=0, commit_interval=500):\n self.conn = self.resetConn(timeout=-1)\n cursor = self.conn.cursor()\n start = time.time()\n cnt = 0\n if duration > 0:\n while (time.time() - start) < duration:\n if max_id > 0:\n id = random.randint(1, max_id - 1)\n cursor.execute(sql + str(id) + ';')\n else:\n cursor.execute(sql)\n cnt += 1\n if cnt % 
commit_interval == 0:\n self.conn.commit()\n else:\n print(\"error, the duration should be larger than 0\")\n self.conn.commit()\n cursor.close()\n self.conn.close()\n return cnt\n\n def concurrent_execute_sql(self, threads, duration, sql, max_id=0, commit_interval=500):\n pool = ThreadPool(threads)\n results = [pool.apply_async(self.execute_sql_duration, (duration, sql, max_id, commit_interval)) for _ in range(threads)]\n pool.close()\n pool.join()\n return results\n \ndef init():\n #add the config\n config_path = \"/root/DB-GPT/config/tool_config.yaml\"\n with open(config_path, 'r') as config_file:\n config = yaml.safe_load(config_file) \n db_args =DBArgs('pgsql', config)\n return db_args\n\n#create a table\ndef create_table(table_name,colsize, ncolumns):\n db=Database(init())\n column_definitions = ', '.join(f'name{i} varchar({colsize})' for i in range(ncolumns))\n creat_sql = f'CREATE TABLE {table_name} (id int, {column_definitions}, time timestamp);'\n db.execute_sqls(creat_sql)\n\n#delete the table\ndef delete_table(table_name):\n db=Database(init())\n delete_sql=f'DROP TABLE if exists {table_name}'\n db.execute_sqls(delete_sql)\n\n#print the current time\ndef print_time():\n current_time = datetime.datetime.now()\n formatted_time = current_time.strftime(\"%Y-%m-%d %H:%M:%S\")\n print(formatted_time)\n\n\ndef insert_large_data(threads,duration,ncolumns,nrows,colsize,table_name='table1'):\n\n print_time()\n #Delete undeleted tables\n delete_table(table_name)\n #create a new table\n create_table(table_name,colsize, ncolumns)\n db=Database(init())\n #insert the data\n #insert_definitions = ', '.join(f'repeat(round(random()*999)::text,{(colsize//3)})' for i in range(ncolumns))\n insert_definitions = ', '.join(f'(SELECT substr(md5(random()::text), 1, {colsize}))' for i in range(ncolumns))\n insert_data=f'insert into {table_name} select generate_series(1,{nrows}),{insert_definitions}, now();'\n db.concurrent_execute_sql(threads,duration,insert_data,commit_interval=1)\n\n #delete the table\n delete_table(table_name)\n \n #print the end time\n print_time()\nif __name__ == \"__main__\":\n # Number of threads to use for concurrent inserts\n num_threads = 72\n \n # Duration for which to run the inserts (in seconds)\n insert_duration = None\n \n # Number of columns in the table\n num_columns = 7\n \n # Number of rows to insert\n num_rows = 87\n \n # Size of each column (in characters)\n column_size = 24\n \n # Table name\n table_name = 'table1'\n \n # Call the insert_large_data function\n insert_large_data(num_threads, insert_duration, num_columns, num_rows, column_size, table_name)\n", + "description": "In an IoT application, 72 sensors generate a large amount of data that needs to be simultaneously inserted into a database. The database table has 7 columns and 87 rows of data, with each column containing 24 characters. This simulates an exception caused by the insertion of a large amount of data into the database.\n", + "desc": "In an Internet of Things (IoT) scenario, imagine a specialized database called 'IoTDataDB' that is used to store and analyze sensor data. This database handles a large volume of data from various types of sensors, such as temperature, humidity, pressure, light, and motion. One of the key tables in this database is called 'SensorReadings', which contains information about sensor readings from 62 different sensors. Each row in the table represents a reading for a specific sensor, and there are 36 columns per row, each with a size of 65 characters. 
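
A condensed sketch of the INSERT_LARGE_DATA trigger above: many workers run the same generate_series bulk insert concurrently and commit after every statement (commit_interval=1 in the dataset script). Connection settings are placeholders, the table is assumed to exist, and an explicit 60-second duration replaces the unset insert_duration from the __main__ block above.

    import time
    from multiprocessing.pool import ThreadPool

    import psycopg2

    # Placeholder connection settings.
    DSN = "dbname=testdb user=postgres password=postgres host=127.0.0.1 port=5432"

    def bulk_insert(table, ncolumns, nrows, colsize, duration):
        conn = psycopg2.connect(DSN)
        cur = conn.cursor()
        vals = ", ".join(f"substr(md5(random()::text), 1, {colsize})" for _ in range(ncolumns))
        sql = f"INSERT INTO {table} SELECT generate_series(1, {nrows}), {vals}, now()"
        start = time.time()
        while time.time() - start < duration:
            cur.execute(sql)
            conn.commit()          # one commit per insert, as in the dataset script
        conn.close()

    if __name__ == "__main__":
        # 72 concurrent "sensors" inserting into the same table, as in entry "96".
        # Assumes table1 (id int, name0..name6 varchar, time timestamp) already exists.
        pool = ThreadPool(72)
        for _ in range(72):
            pool.apply_async(bulk_insert, ("table1", 7, 87, 24, 60))
        pool.close()
        pool.join()
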
These columns may include sensor ID, reading type, reading value, timestamp, sensor location, and status information. The database is designed to support simultaneous data insertion operations from multiple sensors. However, when 72 sensors attempt to insert data into the 'SensorReadings' table at the same time, it can put a strain on the database's performance. The lack of proper data partitioning, buffering mechanisms, or indexing can lead to increased write latency and potential anomalies in the database. These anomalies can impact the real-time processing of sensor data and hinder the overall functionality and efficiency of the IoT system.\n" + }, + "97": { + "start_time": "1697311014", + "end_time": "1697311085", + "start_timestamp": "2023-10-15 03:16:54", + "end_timestamp": "2023-10-15 03:18:05", + "alerts": [], + "labels": [ + "highly concurrent commits or highly concurrent inserts" + ], + "command": "python anomaly_trigger/main.py --anomaly INSERT_LARGE_DATA", + "script": "import psycopg2\nimport sys\nsys.path.append('/root/DB-GPT/')\nimport time\nimport datetime\nimport random\nimport yaml\nfrom multiprocessing.pool import *\n\nclass DBArgs(object):\n\n def __init__(self, dbtype, config, dbname=None):\n self.dbtype = dbtype\n if self.dbtype == 'mysql':\n self.host = config['host']\n self.port = config['port']\n self.user = config['user']\n self.password = config['password']\n self.dbname = dbname if dbname else config['dbname']\n self.driver = 'com.mysql.jdbc.Driver'\n self.jdbc = 'jdbc:mysql://'\n else:\n self.host = config['host']\n self.port = config['port']\n self.user = config['user']\n self.password = config['password']\n self.dbname = dbname if dbname else config['dbname']\n self.driver = 'org.postgresql.Driver'\n self.jdbc = 'jdbc:postgresql://'\n\nclass Database():\n def __init__(self, args, timeout=-1):\n self.args = args\n self.conn = self.resetConn(timeout)\n\n\n # self.schema = self.compute_table_schema()\n\n def resetConn(self, timeout=-1):\n conn = psycopg2.connect(database=self.args.dbname,\n user=self.args.user,\n password=self.args.password,\n host=self.args.host,\n port=self.args.port)\n return conn\n\n def execute_sqls(self,sql):\n self.conn =self.resetConn(timeout=-1)\n cur = self.conn.cursor()\n cur.execute(sql)\n self.conn.commit()\n cur.close()\n self.conn.close()\n\n def execute_sql_duration(self, duration, sql, max_id=0, commit_interval=500):\n self.conn = self.resetConn(timeout=-1)\n cursor = self.conn.cursor()\n start = time.time()\n cnt = 0\n if duration > 0:\n while (time.time() - start) < duration:\n if max_id > 0:\n id = random.randint(1, max_id - 1)\n cursor.execute(sql + str(id) + ';')\n else:\n cursor.execute(sql)\n cnt += 1\n if cnt % commit_interval == 0:\n self.conn.commit()\n else:\n print(\"error, the duration should be larger than 0\")\n self.conn.commit()\n cursor.close()\n self.conn.close()\n return cnt\n\n def concurrent_execute_sql(self, threads, duration, sql, max_id=0, commit_interval=500):\n pool = ThreadPool(threads)\n results = [pool.apply_async(self.execute_sql_duration, (duration, sql, max_id, commit_interval)) for _ in range(threads)]\n pool.close()\n pool.join()\n return results\n \ndef init():\n #add the config\n config_path = \"/root/DB-GPT/config/tool_config.yaml\"\n with open(config_path, 'r') as config_file:\n config = yaml.safe_load(config_file) \n db_args =DBArgs('pgsql', config)\n return db_args\n\n#create a table\ndef create_table(table_name,colsize, ncolumns):\n db=Database(init())\n column_definitions = ', 
'.join(f'name{i} varchar({colsize})' for i in range(ncolumns))\n creat_sql = f'CREATE TABLE {table_name} (id int, {column_definitions}, time timestamp);'\n db.execute_sqls(creat_sql)\n\n#delete the table\ndef delete_table(table_name):\n db=Database(init())\n delete_sql=f'DROP TABLE if exists {table_name}'\n db.execute_sqls(delete_sql)\n\n#print the current time\ndef print_time():\n current_time = datetime.datetime.now()\n formatted_time = current_time.strftime(\"%Y-%m-%d %H:%M:%S\")\n print(formatted_time)\n\n\ndef insert_large_data(threads,duration,ncolumns,nrows,colsize,table_name='table1'):\n\n print_time()\n #Delete undeleted tables\n delete_table(table_name)\n #create a new table\n create_table(table_name,colsize, ncolumns)\n db=Database(init())\n #insert the data\n #insert_definitions = ', '.join(f'repeat(round(random()*999)::text,{(colsize//3)})' for i in range(ncolumns))\n insert_definitions = ', '.join(f'(SELECT substr(md5(random()::text), 1, {colsize}))' for i in range(ncolumns))\n insert_data=f'insert into {table_name} select generate_series(1,{nrows}),{insert_definitions}, now();'\n db.concurrent_execute_sql(threads,duration,insert_data,commit_interval=1)\n\n #delete the table\n delete_table(table_name)\n \n #print the end time\n print_time()\nif __name__ == \"__main__\":\n # Number of threads to use for concurrent inserts\n num_threads = 72\n \n # Duration for which to run the inserts (in seconds)\n insert_duration = None\n \n # Number of columns in the table\n num_columns = 36\n \n # Number of rows to insert\n num_rows = 62\n \n # Size of each column (in characters)\n column_size = 65\n \n # Table name\n table_name = 'table1'\n \n # Call the insert_large_data function\n insert_large_data(num_threads, insert_duration, num_columns, num_rows, column_size, table_name)\n", + "description": "In a smart home automation system, 72 sensors generate a large amount of data that needs to be inserted into the database simultaneously. This data includes information about various aspects of the smart home, such as temperature, lighting, security, and energy consumption. Each data point is represented by 36 columns, with each column having a size of 65 characters. The data is organized into 62 rows, with each row representing a specific time period or event. This simulation helps identify any database exceptions that may occur due to the insertion of such large amounts of data.\n", + "desc": "In an internet of things (IoT) scenario, imagine a database specifically used for collecting and analyzing sensor data from various devices. This database, named 'SensorDataDB', is designed to handle a large volume of data from these sensors. One of the key tables in this database is called 'SensorReadings', which stores information about the readings from the sensors. This table contains 227 rows of data, with each row representing a specific reading from a sensor. The table has 63 columns, each containing information of up to 68 characters. These columns might include sensor ID, reading type, reading value, timestamp, sensor location, and status information. 
\n" + }, + "98": { + "start_time": "1697311145", + "end_time": "1697311205", + "start_timestamp": "2023-10-15 03:19:05", + "end_timestamp": "2023-10-15 03:20:05", + "alerts": [], + "labels": [ + "highly concurrent updates" + ], + "command": "python anomaly_trigger/main.py --anomaly LOCK_CONTENTION", + "script": "import psycopg2\nimport sys\nsys.path.append('/root/DB-GPT/')\nimport time\nimport datetime\nimport random\nimport yaml\nfrom multiprocessing.pool import *\n\n\nclass DBArgs(object):\n\n def __init__(self, dbtype, config, dbname=None):\n self.dbtype = dbtype\n if self.dbtype == 'mysql':\n self.host = config['host']\n self.port = config['port']\n self.user = config['user']\n self.password = config['password']\n self.dbname = dbname if dbname else config['dbname']\n self.driver = 'com.mysql.jdbc.Driver'\n self.jdbc = 'jdbc:mysql://'\n else:\n self.host = config['host']\n self.port = config['port']\n self.user = config['user']\n self.password = config['password']\n self.dbname = dbname if dbname else config['dbname']\n self.driver = 'org.postgresql.Driver'\n self.jdbc = 'jdbc:postgresql://'\n\nclass Database():\n def __init__(self, args, timeout=-1):\n self.args = args\n self.conn = self.resetConn(timeout)\n\n\n # self.schema = self.compute_table_schema()\n\n def resetConn(self, timeout=-1):\n conn = psycopg2.connect(database=self.args.dbname,\n user=self.args.user,\n password=self.args.password,\n host=self.args.host,\n port=self.args.port)\n return conn\n \n def execute_sqls(self,sql):\n self.conn =self.resetConn(timeout=-1)\n cur = self.conn.cursor()\n cur.execute(sql)\n self.conn.commit()\n cur.close()\n self.conn.close()\n\n def execute_sql_duration(self, duration, sql, max_id=0, commit_interval=500):\n self.conn = self.resetConn(timeout=-1)\n cursor = self.conn.cursor()\n start = time.time()\n cnt = 0\n if duration > 0:\n while (time.time() - start) < duration:\n if max_id > 0:\n id = random.randint(1, max_id - 1)\n cursor.execute(sql + str(id) + ';')\n else:\n cursor.execute(sql)\n cnt += 1\n if cnt % commit_interval == 0:\n self.conn.commit()\n else:\n print(\"error, the duration should be larger than 0\")\n self.conn.commit()\n cursor.close()\n self.conn.close()\n return cnt\n\n def concurrent_execute_sql(self, threads, duration, sql, max_id=0, commit_interval=500):\n pool = ThreadPool(threads)\n results = [pool.apply_async(self.execute_sql_duration, (duration, sql, max_id, commit_interval)) for _ in range(threads)]\n pool.close()\n pool.join()\n return results\n\ndef init():\n #add the config\n config_path = \"/root/DB-GPT/config/tool_config.yaml\"\n with open(config_path, 'r') as config_file:\n config = yaml.safe_load(config_file) \n db_args =DBArgs('pgsql', config)\n return db_args\n\n\n#create a table\ndef create_table(table_name,colsize, ncolumns):\n db=Database(init())\n column_definitions = ', '.join(f'name{i} varchar({colsize})' for i in range(ncolumns))\n creat_sql = f'CREATE TABLE {table_name} (id int, {column_definitions}, time timestamp);'\n db.execute_sqls(creat_sql)\n\n#delete the table\ndef delete_table(table_name):\n db=Database(init())\n delete_sql=f'DROP TABLE if exists {table_name}'\n db.execute_sqls(delete_sql)\n\n#print the current time\ndef print_time():\n current_time = datetime.datetime.now()\n formatted_time = current_time.strftime(\"%Y-%m-%d %H:%M:%S\")\n print(formatted_time)\n\n\ndef lock_contention(threads,duration,ncolumns,nrows,colsize,table_name='table1'):\n #create a new table\n print_time()\n delete_table(table_name)\n 
create_table(table_name,colsize, ncolumns)\n db=Database(init())\n # insert some data to be updated \n insert_definitions = ', '.join(f'(SELECT substr(md5(random()::text), 1, {colsize}))' for i in range(ncolumns))\n insert_data=f'insert into {table_name} select generate_series(1,{nrows}),{insert_definitions}, now();' \n db.execute_sqls(insert_data) \n pool = Pool(threads)\n for _ in range(threads):\n pool.apply_async(\n lock, (table_name, ncolumns, colsize, duration, nrows))\n pool.close()\n pool.join()\n #delete the table\n delete_table(table_name)\n print_time()\n\ndef lock(table_name, ncolumns, colsize, duration, nrows):\n args=init()\n start = time.time()\n #lock_contention\n while time.time()-start < duration:\n conn = psycopg2.connect(database=args.dbname, user=args.user, password=args.password,\n host=args.host, port=args.port)\n cur = conn.cursor()\n while time.time()-start < duration:\n col_name = random.randint(0, ncolumns-1)\n row_name = random.randint(1, nrows-1)\n lock_contention = f'update {table_name} set name{col_name}=(SELECT substr(md5(random()::text), 1, {colsize})) where id ={row_name}'\n #db.concurrent_execute_sql(threads,duration,lock_contention,nrows)\n cur.execute(lock_contention)\n conn.commit()\n conn.commit()\n conn.close()\n\nif __name__ == \"__main__\":\n # Number of threads to use for concurrent inserts\n num_threads = 174\n \n # Duration for which to run the inserts (in seconds)\n insert_duration = None\n \n # Number of columns in the table\n num_columns = 63\n \n # Number of rows to insert\n num_rows = 227\n \n # Size of each column (in characters)\n column_size = 68\n \n # Table name\n table_name = 'table1'\n \n # Call the insert_large_data function\n lock_contention(num_threads, insert_duration, num_columns, num_rows, column_size, table_name)\n", + "description": "In a database used by an online store, 174 users are simultaneously attempting to perform frequent update operations on a table containing 63 columns and 227 rows of product records, each with a column size of 68 characters. These users are competing with each other to lock the database table, resulting in database contention and potential exceptions.\n", + "desc": "In a typical IoT scenario, multiple sensors might be sending data to the database simultaneously. For example, sensors embedded in smart homes could be constantly sending temperature, humidity, and motion data. When a large number of sensors, in this case 174 sensors, send data simultaneously, it can cause contention issues in the database. This means that multiple sensors are trying to access or modify the same table or rows at the same time, effectively competing for resources. This contention can lead to performance issues, such as slow response times or even database locking. 
It can also affect other operations in the database, such as retrieving data or inserting new records.\n" + }, + "99": { + "start_time": "1697311265", + "end_time": "1697311352", + "start_timestamp": "2023-10-15 03:21:05", + "end_timestamp": "2023-10-15 03:22:32", + "alerts": [], + "labels": [ + "highly deletes" + ], + "command": "python anomaly_trigger/main.py --anomaly VACUUM", + "script": "import psycopg2\nimport sys\nsys.path.append('/root/DB-GPT/')\nimport time\nimport datetime\nimport random\nimport yaml\nfrom multiprocessing.pool import *\n\n\nclass DBArgs(object):\n\n def __init__(self, dbtype, config, dbname=None):\n self.dbtype = dbtype\n if self.dbtype == 'mysql':\n self.host = config['host']\n self.port = config['port']\n self.user = config['user']\n self.password = config['password']\n self.dbname = dbname if dbname else config['dbname']\n self.driver = 'com.mysql.jdbc.Driver'\n self.jdbc = 'jdbc:mysql://'\n else:\n self.host = config['host']\n self.port = config['port']\n self.user = config['user']\n self.password = config['password']\n self.dbname = dbname if dbname else config['dbname']\n self.driver = 'org.postgresql.Driver'\n self.jdbc = 'jdbc:postgresql://'\n\nclass Database():\n def __init__(self, args, timeout=-1):\n self.args = args\n self.conn = self.resetConn(timeout)\n\n\n # self.schema = self.compute_table_schema()\n\n def resetConn(self, timeout=-1):\n conn = psycopg2.connect(database=self.args.dbname,\n user=self.args.user,\n password=self.args.password,\n host=self.args.host,\n port=self.args.port)\n return conn\n \n def execute_sqls(self,sql):\n self.conn =self.resetConn(timeout=-1)\n cur = self.conn.cursor()\n cur.execute(sql)\n self.conn.commit()\n cur.close()\n self.conn.close()\n\n def execute_sql_duration(self, duration, sql, max_id=0, commit_interval=500):\n self.conn = self.resetConn(timeout=-1)\n cursor = self.conn.cursor()\n start = time.time()\n cnt = 0\n if duration > 0:\n while (time.time() - start) < duration:\n if max_id > 0:\n id = random.randint(1, max_id - 1)\n cursor.execute(sql + str(id) + ';')\n else:\n cursor.execute(sql)\n cnt += 1\n if cnt % commit_interval == 0:\n self.conn.commit()\n else:\n print(\"error, the duration should be larger than 0\")\n self.conn.commit()\n cursor.close()\n self.conn.close()\n return cnt\n\n def concurrent_execute_sql(self, threads, duration, sql, max_id=0, commit_interval=500):\n pool = ThreadPool(threads)\n results = [pool.apply_async(self.execute_sql_duration, (duration, sql, max_id, commit_interval)) for _ in range(threads)]\n pool.close()\n pool.join()\n return results\n \ndef init():\n #add the config\n config_path = \"/root/DB-GPT/config/tool_config.yaml\"\n with open(config_path, 'r') as config_file:\n config = yaml.safe_load(config_file) \n db_args =DBArgs('pgsql', config)\n return db_args\n\n\n#create a table\ndef create_table(table_name,colsize, ncolumns):\n db=Database(init())\n column_definitions = ', '.join(f'name{i} varchar({colsize})' for i in range(ncolumns))\n creat_sql = f'CREATE TABLE {table_name} (id int, {column_definitions}, time timestamp);'\n db.execute_sqls(creat_sql)\n\n#delete the table\ndef delete_table(table_name):\n db=Database(init())\n delete_sql=f'DROP TABLE if exists {table_name}'\n db.execute_sqls(delete_sql)\n\n#print the current time\ndef print_time():\n current_time = datetime.datetime.now()\n formatted_time = current_time.strftime(\"%Y-%m-%d %H:%M:%S\")\n print(formatted_time)\n\n\ndef vacuum(threads,duration,ncolumns,nrows,colsize,table_name='table1'):\n 
db=Database(init())\n #create a new table\n print_time()\n delete_table(table_name)\n create_table(table_name,colsize, ncolumns)\n\n # insert some data to be deleted\n insert_definitions = ', '.join(f'(SELECT substr(md5(random()::text), 1, {colsize}))' for i in range(ncolumns))\n insert_data=f'insert into {table_name} select generate_series(1,{nrows}),{insert_definitions}, now();' \n db.execute_sqls(insert_data) \n\n # delete 80% of the rows\n delete_nrows=int(nrows*0.8)\n vacuum=f'delete from {table_name} where id < {delete_nrows};'\n db.execute_sqls(vacuum)\n\n # do the select , then the vacuum occurs\n select='select * from '+table_name+' where id='\n db.concurrent_execute_sql(threads,duration,select,nrows)\n\n #delete the table\n delete_table(table_name)\n print_time()\n\n\nif __name__ == \"__main__\":\n # Number of threads to use for concurrent inserts\n num_threads = 200\n \n # Duration for which to run the inserts (in seconds)\n insert_duration = None\n \n # Number of columns in the table\n num_columns = 7\n \n # Number of rows to insert\n num_rows = 3843688\n \n # Size of each column (in characters)\n column_size = 65\n \n # Table name\n table_name = 'table1'\n \n # Call the insert_large_data function\n vacuum(num_threads, insert_duration, num_columns, num_rows, column_size, table_name)\n", + "description": "In a database scenario, where an online store has 200 users simultaneously searching the database table containing 7 columns and 3,843,688 rows, each column with a size of 65 characters, a vacuum operation is performed on the database. This operation involves cleaning up and reorganizing the data to optimize its storage and performance. However, the large number of concurrent search operations and the vacuum process may result in a database exception or performance degradation.\n", + "desc": "In an IoT scenario, imagine there is a database used for collecting and analyzing sensor data from various devices, named 'SensorDataDB'. This database is designed to handle a large volume of data. In this case, the database administrator needs to perform a process known as 'VACUUM' on the database. This process involves optimizing the space usage by removing any unnecessary data or fragmentation. The database contains a table called 'SensorReadings', which stores data from multiple sensors. This table has a total of 3,843,688 rows, each representing a sensor reading, with 7 columns, each containing information up to 65 characters. These columns might include sensor ID, reading type, reading value, timestamp, sensor location, and status information.During the 'VACUUM' process, the administrator aims to reclaim unused space, rearrange data for better performance, and optimize disk storage usage. However, given the large number of rows in the 'SensorReadings' table and the amount of data stored, performing the 'VACUUM' operation with a high number of threads (200) can lead to potential issues, such as increased disk I/O and concurrency contention. This might impact the performance of other database operations, such as data insertion or query processing. 
Therefore, it is important to carefully configure and monitor the 'VACUUM' operation to avoid any anomalies or adverse effects on the database.\n" + }, + "100": { + "start_time": "1697311412", + "end_time": "1697311526", + "start_timestamp": "2023-10-15 03:23:32", + "end_timestamp": "2023-10-15 03:25:26", + "alerts": [], + "labels": [ + "too many indexes" + ], + "command": "python anomaly_trigger/main.py --anomaly REDUNDANT_INDEX", + "script": "import psycopg2\nimport sys\nsys.path.append('/root/DB-GPT/')\nimport time\nimport datetime\nimport random\nimport yaml\nfrom multiprocessing.pool import *\n\n\nclass DBArgs(object):\n\n def __init__(self, dbtype, config, dbname=None):\n self.dbtype = dbtype\n if self.dbtype == 'mysql':\n self.host = config['host']\n self.port = config['port']\n self.user = config['user']\n self.password = config['password']\n self.dbname = dbname if dbname else config['dbname']\n self.driver = 'com.mysql.jdbc.Driver'\n self.jdbc = 'jdbc:mysql://'\n else:\n self.host = config['host']\n self.port = config['port']\n self.user = config['user']\n self.password = config['password']\n self.dbname = dbname if dbname else config['dbname']\n self.driver = 'org.postgresql.Driver'\n self.jdbc = 'jdbc:postgresql://'\n\nclass Database():\n def __init__(self, args, timeout=-1):\n self.args = args\n self.conn = self.resetConn(timeout)\n\n\n # self.schema = self.compute_table_schema()\n\n def resetConn(self, timeout=-1):\n conn = psycopg2.connect(database=self.args.dbname,\n user=self.args.user,\n password=self.args.password,\n host=self.args.host,\n port=self.args.port)\n return conn\n \n def execute_sqls(self,sql):\n self.conn =self.resetConn(timeout=-1)\n cur = self.conn.cursor()\n cur.execute(sql)\n self.conn.commit()\n cur.close()\n self.conn.close()\n\n def execute_sql_duration(self, duration, sql, max_id=0, commit_interval=500):\n self.conn = self.resetConn(timeout=-1)\n cursor = self.conn.cursor()\n start = time.time()\n cnt = 0\n if duration > 0:\n while (time.time() - start) < duration:\n if max_id > 0:\n id = random.randint(1, max_id - 1)\n cursor.execute(sql + str(id) + ';')\n else:\n cursor.execute(sql)\n cnt += 1\n if cnt % commit_interval == 0:\n self.conn.commit()\n else:\n print(\"error, the duration should be larger than 0\")\n self.conn.commit()\n cursor.close()\n self.conn.close()\n return cnt\n\n def concurrent_execute_sql(self, threads, duration, sql, max_id=0, commit_interval=500):\n pool = ThreadPool(threads)\n results = [pool.apply_async(self.execute_sql_duration, (duration, sql, max_id, commit_interval)) for _ in range(threads)]\n pool.close()\n pool.join()\n return results\n \n def build_index(self, table_name, idx_num):\n self.conn = self.resetConn(timeout=-1)\n cursor = self.conn.cursor()\n \n for i in range(0, idx_num):\n the_sql = 'CREATE INDEX index_' + table_name + '_' + str(i) + ' ON ' + table_name + '(name' + str(i) + ');'\n print(the_sql)\n cursor.execute(the_sql)\n\n \n self.conn.commit()\n self.conn.close()\n return\n\n\n \n def drop_index(self,table_name):\n self.conn = self.resetConn(timeout=-1)\n cursor = self.conn.cursor()\n cursor.execute(\"select indexname from pg_indexes where tablename='\"+table_name+\"';\")\n idxs = cursor.fetchall()\n for idx in idxs:\n the_sql = 'DROP INDEX ' + idx[0] + ';'\n cursor.execute(the_sql)\n print(the_sql)\n self.conn.commit()\n self.conn.close()\n return\n\n\ndef init():\n #add the config\n config_path = \"/root/DB-GPT/config/tool_config.yaml\"\n with open(config_path, 'r') as config_file:\n config = 
yaml.safe_load(config_file) \n db_args =DBArgs('pgsql', config)\n return db_args\n\n\n#create a table\ndef create_table(table_name,colsize, ncolumns):\n db=Database(init())\n column_definitions = ', '.join(f'name{i} varchar({colsize})' for i in range(ncolumns))\n creat_sql = f'CREATE TABLE {table_name} (id int, {column_definitions}, time timestamp);'\n db.execute_sqls(creat_sql)\n\n#delete the table\ndef delete_table(table_name):\n db=Database(init())\n delete_sql=f'DROP TABLE if exists {table_name}'\n db.execute_sqls(delete_sql)\n\n#print the current time\ndef print_time():\n current_time = datetime.datetime.now()\n formatted_time = current_time.strftime(\"%Y-%m-%d %H:%M:%S\")\n print(formatted_time)\n\ndef redundent_index(threads,duration,ncolumns,nrows,colsize,nindex,table_name='table1'):\n #create a new table\n print_time()\n delete_table(table_name)\n create_table(table_name,colsize, ncolumns)\n db=Database(init())\n # insert some data to be updated \n insert_definitions = ', '.join(f'(SELECT substr(md5(random()::text), 1, {colsize}))' for i in range(ncolumns))\n insert_data=f'insert into {table_name} select generate_series(1,{nrows}),{insert_definitions}, now();' \n db.execute_sqls(insert_data) \n\n #initialization of the indexes\n nindex=int((nindex*ncolumns)/10)\n db.build_index(table_name,nindex)\n id_index='CREATE INDEX index_'+table_name+'_id ON '+table_name+'(id);'\n db.execute_sqls(id_index)\n\n #lock_contention\n pool = Pool(threads)\n for _ in range(threads):\n pool.apply_async(\n lock, (table_name, ncolumns, colsize, duration, nrows))\n pool.close()\n pool.join()\n\n #drop the index\n db.drop_index(table_name)\n\n #delete the table\n delete_table(table_name)\n print_time()\n\n\ndef lock(table_name, ncolumns, colsize, duration, nrows):\n args=init()\n start = time.time()\n #lock_contention\n while time.time()-start < duration:\n conn = psycopg2.connect(database=args.dbname, user=args.user, password=args.password,\n host=args.host, port=args.port)\n cur = conn.cursor()\n while time.time()-start < duration:\n col_name = random.randint(0, ncolumns-1)\n row_name = random.randint(1, nrows-1)\n lock_contention = f'update {table_name} set name{col_name}=(SELECT substr(md5(random()::text), 1, {colsize})) where id ={row_name}'\n #db.concurrent_execute_sql(threads,duration,lock_contention,nrows)\n cur.execute(lock_contention)\n conn.commit()\n conn.commit()\n conn.close()\n\nif __name__ == \"__main__\":\n # Number of threads to use for concurrent inserts\n num_threads = 10\n \n # Duration for which to run the inserts (in seconds)\n insert_duration = None\n \n # Number of columns in the table\n num_columns = 83\n \n # Number of rows to insert\n num_rows = 563820\n \n # Size of each column (in characters)\n column_size = 82\n \n # Table name\n table_name = 'table1'\n \n nindex=6\n \n # Call the insert_large_data function\n redundent_index(num_threads, insert_duration, num_columns, num_rows, column_size, nindex,table_name)\n", + "description": "In an online marketplace, 10 users simultaneously perform a query operation on a database table containing 83 columns and 563,820 rows of product records, each column having a size of 82 characters. These queries involve redundant indexes that were created at the beginning of the query and will be deleted after the operation. 
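A minimal sketch, assuming placeholder credentials, of listing the single-column indexes that redundent_index() builds on table1; it reuses the same pg_indexes catalog view that the script's drop_index() queries, and is shown only to illustrate what a redundant index means in this entry.

import psycopg2

conn = psycopg2.connect(database="testdb", user="xxxx", password="xxxx",
                        host="xxxx", port=5432)  # placeholder credentials
cur = conn.cursor()
cur.execute("SELECT indexname, indexdef FROM pg_indexes WHERE tablename = %s;", ("table1",))
for name, definition in cur.fetchall():
    # Apart from index_table1_id, every index only covers a synthetic name<i> column,
    # so under the concurrent UPDATE workload it adds write overhead without helping reads.
    print(name, "->", definition)
cur.close()
conn.close()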
This process can result in additional storage requirements and performance overhead.\n", + "desc": "In the business intelligence scenario, there is a database called 'CorporateFinanceDB' that stores and processes financial data for large corporations. This database contains multiple tables, one of which is the 'FinancialRecords' table. This table holds 563,820 rows of data, each representing a financial record, with a total of 83 columns and each column able to hold up to 82 characters. These columns capture information such as the transaction ID, transaction type, amount, date, department, project code, budget code, financial year, and audit status. In this scenario, there is an anomaly related to redundant indexes. During a specific situation, 10 users are simultaneously executing complex financial queries on the 'FinancialRecords' table. To improve query efficiency, the database administrator creates multiple indexes before the queries and then removes them afterward. However, this frequent creation and deletion of indexes can consume additional storage space and cause performance overhead. It can also lead to database fragmentation, resulting in delayed generation of financial reports and affecting the decision-making process.\n" + }, + "101": { + "start_time": "1697311586", + "end_time": "1697311677", + "start_timestamp": "2023-10-15 03:26:26", + "end_timestamp": "2023-10-15 03:27:57", + "alerts": [], + "labels": [ + "INSERT_LARGE_DATA", + "IO_CONTENTION" + ], + "command": "python anomaly_trigger/main.py --anomaly INSERT_LARGE_DATA,IO_CONTENTION", + "script": "import os\nimport datetime\n\n#print the current time\ndef print_time():\n current_time = datetime.datetime.now()\n formatted_time = current_time.strftime(\"%Y-%m-%d %H:%M:%S\")\n print(formatted_time)\n\nif __name__ == \"__main__\":\n print_time()\n command = (\n \"su - root -c 'cd /sysbench-tpcc-master; \"\n \"./tpcc.lua --db-driver=pgsql --tables=2 --scale=3 --threads=50 --events=0 \"\n \"--pgsql-host=xxxx --pgsql-user=xxxx --pgsql-password=xxxx \"\n \"--pgsql-port=5432 --pgsql-db=tpcc --time=90 --rand-type=uniform --report-interval=10 run'\"\n )\n\n os.system(command)\n print_time()\n", + "description": "In a file sharing platform, multiple users are simultaneously uploading, downloading, or editing files, causing competition for input/output resources. This leads to a slowdown in the file transfer process.\n", + "desc": "In an e-commerce platform, there is a database used for storing and managing product information. This database, called 'ProductDB', contains a table named 'ProductDetails' that stores details of various products, including their names, prices, descriptions, brands, categories, and images. On a typical day, there are numerous users simultaneously adding new products or updating existing ones in the system. Additionally, users might also be downloading or viewing product images. Due to the high concurrency of these file operations, the database experiences Input/Output (I/O) contention issues. This means that the system's storage and network bandwidth are strained, leading to slower file transfer speeds. Furthermore, frequent write operations, such as adding or updating product information, can impact the overall performance of the database. 
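One way to see the I/O pressure described here is to sample OS-level disk counters while the TPC-C run is active; the sketch below assumes the third-party psutil package, which is not used by the trigger scripts themselves.

import time
import psutil  # assumed to be installed for this illustration only

before = psutil.disk_io_counters()
time.sleep(10)  # sample while the sysbench/TPC-C workload is running
after = psutil.disk_io_counters()
print("read  MB/s:", (after.read_bytes - before.read_bytes) / 10 / 1e6)
print("write MB/s:", (after.write_bytes - before.write_bytes) / 10 / 1e6)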
In peak periods, this can cause locking and transaction management issues, further slowing down file processing and metadata recording.\n" + }, + "102": { + "start_time": "1697311737", + "end_time": "1697311797", + "start_timestamp": "2023-10-15 03:28:57", + "end_timestamp": "2023-10-15 03:29:57", + "alerts": [], + "labels": [ + "POOR JOIN PERFORMANCE", + "CPU CONTENTION" + ], + "command": "python anomaly_trigger/main.py --anomaly POOR_JOIN_PERFORMANCE,CPU_CONTENTION", + "script": "import os\nimport re\nimport time\n\nimport psycopg2\n\n\nREPEATCOUNT = 1\nTIMELOGPATH = str(int(time.time())) + \"_job_trigger_time_log.txt\"\nTIMELOG = open(TIMELOGPATH, 'w+')\n\n\nclass Database():\n\n def __init__(self):\n self.conn = None\n self.conn = psycopg2.connect(database='imdbload',\n user='xxxx',\n password='xxxx',\n host='xxxx',\n port=5432)\n\n def execute_sql(self, sql):\n fail = 1\n cur = self.conn.cursor()\n i = 0\n cnt = 3\n while fail == 1 and i < cnt:\n try:\n fail = 0\n cur.execute(sql)\n except BaseException as error:\n fail = 1\n print(error)\n res = []\n if fail == 0:\n res = cur.fetchall()\n i = i + 1\n if fail == 1:\n # print(\"SQL Execution Fatal!!\", sql)\n return 0, ''\n elif fail == 0:\n return 1, res\n\n\ndef all_sql_files():\n res_path = \"{}/join-order-benchmark-master/\".format(\n os.path.dirname(os.path.abspath(__file__)))\n # all_file_list = list(filter(file_filter, os.listdir(res_path)))\n # all_file_list = sorted(all_file_list, key=custom_sort)\n all_file_list = [\n '1a.sql', '1b.sql', '1c.sql', '1d.sql',\n '2a.sql', '2b.sql', '2c.sql', '2d.sql',\n '3a.sql', '3b.sql', '3c.sql',\n '4a.sql', '4b.sql', '4c.sql',\n '5a.sql', '5b.sql', '5c.sql',\n '6a.sql', '6b.sql', '6c.sql', '6d.sql', '6e.sql', '6f.sql',\n '7a.sql', '7b.sql', '7c.sql',\n '8a.sql', '8b.sql', '8c.sql', '8d.sql',\n '9a.sql', '9b.sql', '9c.sql', '9d.sql',\n '10a.sql', '10b.sql', '10c.sql',\n '11a.sql', '11b.sql', '11c.sql', '11d.sql',\n '12a.sql', '12b.sql', '12c.sql',\n '13a.sql', '13b.sql', '13c.sql', '13d.sql',\n '14a.sql', '14b.sql', '14c.sql',\n '15a.sql', '15b.sql', '15c.sql', '15d.sql',\n '16a.sql', '16b.sql', '16c.sql', '16d.sql',\n '17a.sql', '17b.sql', '17c.sql', '17d.sql', '17e.sql', '17f.sql',\n '18a.sql', '18b.sql', '18c.sql',\n '19a.sql', '19b.sql', '19c.sql', '19d.sql',\n '20a.sql', '20b.sql', '20c.sql',\n '21a.sql', '21b.sql', '21c.sql',\n '22a.sql', '22b.sql', '22c.sql', '22d.sql',\n '23a.sql', '23b.sql', '23c.sql',\n '24a.sql', '24b.sql',\n '25a.sql', '25b.sql', '25c.sql',\n '26a.sql', '26b.sql', '26c.sql',\n '27a.sql', '27b.sql', '27c.sql',\n '28a.sql', '28b.sql', '28c.sql',\n '29a.sql', '29b.sql', '29c.sql',\n '30a.sql', '30b.sql', '30c.sql',\n '31a.sql', '31b.sql', '31c.sql',\n '32a.sql', '32b.sql',\n '33a.sql', '33b.sql', '33c.sql']\n\n print(all_file_list)\n files_list = []\n for file in all_file_list:\n files_list.append(res_path + file)\n return files_list\n\n\ndef custom_sort(item):\n # \u63d0\u53d6\u6570\u5b57\u548c\u5b57\u6bcd\u90e8\u5206\n match = re.match(r'(\\d+)(\\D+)', item)\n # \u5c06\u6570\u5b57\u90e8\u5206\u8f6c\u6362\u4e3a\u6574\u6570\u4ee5\u8fdb\u884c\u6bd4\u8f83\n num_part = int(match.group(1))\n # \u8fd4\u56de\u5143\u7ec4\u4ee5\u6309\u6570\u5b57\u548c\u5b57\u6bcd\u6392\u5e8f\n return (num_part, match.group(2))\n\n\ndef file_filter(f):\n if f[-4:] == '.sql' and 'schema' not in f and 'fkindexes' not in f:\n return True\n else:\n return False\n\n\ndef get_sql_from_file(file_name):\n file = open(file_name)\n lines = file.readlines().copy()\n sql = ''\n for line in lines:\n sql 
+= line\n sql = sql.replace('\n', ' ').replace(' ', ' ').replace(' ', ' ')\n file.close()\n return sql\n\n\ndef test_hint_from_file(sql_file):\n db = Database()\n sql = get_sql_from_file(sql_file)\n success, result_cont = db.execute_sql(sql)\n print(success, result_cont)\n\n\ndef test_all():\n sql_files = all_sql_files()\n\n for sql_file in list(sql_files)[:-10]:\n if sql_file:\n test_hint_from_file(sql_file)\n\n\ndef test_one():\n res_path = \"{}/join-order-benchmark-master/\".format(\n os.path.dirname(os.path.abspath(__file__)))\n test_hint_from_file(res_path + '1a.sql')\n\n\nif __name__ == '__main__':\n for i in range(0, REPEATCOUNT):\n TIMELOG.write(str(int(time.time()))+\";\")\n test_all()\n TIMELOG.write(str(int(time.time()))+\"\n\")\n TIMELOG.flush()\n\n TIMELOG.close()\n\n)\n", + "description": "In a database system used for analyzing customer data in a marketing campaign, there is a poor performance issue when performing join operations between multiple tables. This is due to the lack of proper indexing and optimization techniques. Additionally, there is a high level of contention for CPU resources caused by concurrent processes running on the same machine. This leads to slow query execution and reduced overall system performance.\n", + "desc": "In an e-commerce scenario, there is a database named 'SalesDB', which is used to store and manage sales information for an online store. This database contains multiple tables, including a key table named 'OrderDetails', which records detailed information about each customer order. Suppose this table contains thousands of orders, with each order having multiple columns such as order ID, customer ID, product ID, quantity, price, order date, and shipping information.In this scenario, the join operation between the 'OrderDetails' table and another table, such as the 'ProductInformation' table, which stores information about each product, is not optimized for performance. The join operation is used to retrieve additional information about the products included in each order, such as product name, description, and category. Due to the lack of appropriate indexing or inefficient query plans, the join operation becomes slow and resource-intensive.Moreover, during peak hours, multiple users simultaneously execute join queries to retrieve order details and related product information. As a result, the database faces CPU contention, meaning that the CPU resources are being insufficiently allocated to handle these join operations efficiently. 
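A minimal sketch of how one of these join-order-benchmark queries could be profiled with EXPLAIN (ANALYZE, BUFFERS); the file path and credentials are placeholders, and the file is assumed to contain a single SELECT statement.

import psycopg2

conn = psycopg2.connect(database="imdbload", user="xxxx", password="xxxx",
                        host="xxxx", port=5432)  # placeholder credentials
cur = conn.cursor()
sql = open("join-order-benchmark-master/1a.sql").read()  # illustrative path
cur.execute("EXPLAIN (ANALYZE, BUFFERS) " + sql)
for (line,) in cur.fetchall():
    # Large sequential scans feeding nested-loop or hash joins are the usual sign of the
    # poor join performance (and resulting CPU contention) described in this entry.
    print(line)
cur.close()
conn.close()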
This contention leads to slow response times for the queries, which can impact the user experience in accessing sales information.\n" + }, + "103": { + "start_time": "1697311857", + "end_time": "1697312007", + "start_timestamp": "2023-10-15 03:30:57", + "end_timestamp": "2023-10-15 03:33:27", + "alerts": [], + "labels": [ + "FETCH_LARGE_DATA", + "CORRELATED SUBQUERY" + ], + "command": "python anomaly_trigger/main.py --anomaly FETCH_LARGE_DATA,CORRELATED_SUBQUERY", + "script": "import os\nimport re\nimport time\n\nimport psycopg2\n\n\nREPEATCOUNT = 1\nTIMELOGPATH = str(int(time.time())) + \"_tpch_trigger_time_log.txt\"\nTIMELOG = open(TIMELOGPATH, 'w+')\n\n\nclass Database():\n\n def __init__(self):\n self.conn = None\n self.conn = psycopg2.connect(database='tpch',\n user='xxxx',\n password='xxxx',\n host='xxxx',\n port=5432)\n\n def execute_sql(self, sql):\n fail = 1\n cur = self.conn.cursor()\n i = 0\n cnt = 3\n while fail == 1 and i < cnt:\n try:\n fail = 0\n cur.execute(sql)\n except BaseException as error:\n fail = 1\n print(error)\n res = []\n if fail == 0:\n res = cur.fetchall()\n i = i + 1\n if fail == 1:\n # print(\"SQL Execution Fatal!!\", sql)\n return 0, ''\n elif fail == 0:\n return 1, res\n\n\ndef all_sql_files():\n res_path = \"{}/tpch-queries/\".format(\n os.path.dirname(os.path.abspath(__file__)))\n # all_file_list = list(filter(file_filter, os.listdir(res_path)))\n # all_file_list = sorted(all_file_list, key=custom_sort)\n all_file_list = [\n '4.explain.sql']\n\n print(all_file_list)\n files_list = []\n for file in all_file_list:\n files_list.append(res_path + file)\n return files_list\n\n\ndef custom_sort(item):\n # \u63d0\u53d6\u6570\u5b57\u548c\u5b57\u6bcd\u90e8\u5206\n match = re.match(r'(\\d+)(\\D+)', item)\n # \u5c06\u6570\u5b57\u90e8\u5206\u8f6c\u6362\u4e3a\u6574\u6570\u4ee5\u8fdb\u884c\u6bd4\u8f83\n num_part = int(match.group(1))\n # \u8fd4\u56de\u5143\u7ec4\u4ee5\u6309\u6570\u5b57\u548c\u5b57\u6bcd\u6392\u5e8f\n return (num_part, match.group(2))\n\n\ndef file_filter(f):\n if f[-4:] == '.sql' and 'schema' not in f and 'fkindexes' not in f:\n return True\n else:\n return False\n\n\ndef get_sql_from_file(file_name):\n file = open(file_name)\n lines = file.readlines().copy()\n sql = ''\n for line in lines:\n sql += line\n sql = sql.replace('\n', ' ').replace(' ', ' ').replace(' ', ' ')\n file.close()\n return sql\n\n\ndef test_hint_from_file(sql_file):\n db = Database()\n sql = get_sql_from_file(sql_file)\n success, result_cont = db.execute_sql(sql)\n print(success, result_cont)\n\n\ndef test_all():\n sql_files = all_sql_files()\n\n for sql_file in list(sql_files):\n if sql_file:\n test_hint_from_file(sql_file)\n\n\ndef test_one():\n res_path = \"{}/tpch-queries/\".format(\n os.path.dirname(os.path.abspath(__file__)))\n test_hint_from_file(res_path + '1.explain.sql')\n\n\nif __name__ == '__main__':\n for i in range(0, REPEATCOUNT):\n TIMELOG.write(str(int(time.time()))+\";\")\n test_all()\n TIMELOG.write(str(int(time.time()))+\"\n\")\n TIMELOG.flush()\n\n TIMELOG.close()\n", + "description": "In a retail database, fetching large amounts of inventory data for each product may involve executing related subqueries. This can lead to poor performance if the subqueries are not optimized.\n", + "desc": "In the scenario of an e-commerce platform, there is a database named 'ECommerceDB' that stores inventory information for various products. This database has a table called 'ProductInventory' where the inventory data is recorded. 
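The correlated-subquery pattern this entry is labelled with looks roughly like the first query below; the table and column names are hypothetical, chosen to match the scenario text rather than any schema in this repository. The second query shows the usual join rewrite that avoids re-running the inner query once per outer row.

# Correlated form: the inner SELECT is evaluated once for every row of products.
correlated_sql = """
SELECT p.product_id,
       (SELECT i.stock_level
          FROM product_inventory i
         WHERE i.product_id = p.product_id) AS stock_level
  FROM products p
 WHERE p.category = 'electronics';
"""

# Join rewrite: one pass over both tables, and an index on product_id can be used.
rewritten_sql = """
SELECT p.product_id, i.stock_level
  FROM products p
  JOIN product_inventory i ON i.product_id = p.product_id
 WHERE p.category = 'electronics';
"""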
The table contains data for tens of thousands or even hundreds of thousands of products, including details such as product ID, stock levels, last inventory update time, supplier information, and warehouse location. In this database, there is a common query that involves retrieving the total current inventory of all products in a specific category. To perform this query, the database needs to execute subqueries on the 'ProductInventory' table for each product in the category. However, when dealing with a large number of products, this process can become inefficient. For instance, if a category has thousands of products, executing individual subqueries for each product to retrieve the inventory information would take a significant amount of time. This inefficiency is due to the need to read a large amount of data from the disk, which can lead to I/O bottlenecks in the database.\n" + }, + "104": { + "start_time": "1697312067", + "end_time": "1697312140", + "start_timestamp": "2023-10-15 03:34:27", + "end_timestamp": "2023-10-15 03:35:40", + "alerts": [], + "labels": [ + "highly concurrent commits or highly concurrent inserts" + ], + "command": "python anomaly_trigger/main.py --anomaly INSERT_LARGE_DATA", + "script": "import psycopg2\nimport sys\nsys.path.append('/root/DB-GPT/')\nimport time\nimport datetime\nimport random\nimport yaml\nfrom multiprocessing.pool import *\n\nclass DBArgs(object):\n\n def __init__(self, dbtype, config, dbname=None):\n self.dbtype = dbtype\n if self.dbtype == 'mysql':\n self.host = config['host']\n self.port = config['port']\n self.user = config['user']\n self.password = config['password']\n self.dbname = dbname if dbname else config['dbname']\n self.driver = 'com.mysql.jdbc.Driver'\n self.jdbc = 'jdbc:mysql://'\n else:\n self.host = config['host']\n self.port = config['port']\n self.user = config['user']\n self.password = config['password']\n self.dbname = dbname if dbname else config['dbname']\n self.driver = 'org.postgresql.Driver'\n self.jdbc = 'jdbc:postgresql://'\n\nclass Database():\n def __init__(self, args, timeout=-1):\n self.args = args\n self.conn = self.resetConn(timeout)\n\n\n # self.schema = self.compute_table_schema()\n\n def resetConn(self, timeout=-1):\n conn = psycopg2.connect(database=self.args.dbname,\n user=self.args.user,\n password=self.args.password,\n host=self.args.host,\n port=self.args.port)\n return conn\n\n def execute_sqls(self,sql):\n self.conn =self.resetConn(timeout=-1)\n cur = self.conn.cursor()\n cur.execute(sql)\n self.conn.commit()\n cur.close()\n self.conn.close()\n\n def execute_sql_duration(self, duration, sql, max_id=0, commit_interval=500):\n self.conn = self.resetConn(timeout=-1)\n cursor = self.conn.cursor()\n start = time.time()\n cnt = 0\n if duration > 0:\n while (time.time() - start) < duration:\n if max_id > 0:\n id = random.randint(1, max_id - 1)\n cursor.execute(sql + str(id) + ';')\n else:\n cursor.execute(sql)\n cnt += 1\n if cnt % commit_interval == 0:\n self.conn.commit()\n else:\n print(\"error, the duration should be larger than 0\")\n self.conn.commit()\n cursor.close()\n self.conn.close()\n return cnt\n\n def concurrent_execute_sql(self, threads, duration, sql, max_id=0, commit_interval=500):\n pool = ThreadPool(threads)\n results = [pool.apply_async(self.execute_sql_duration, (duration, sql, max_id, commit_interval)) for _ in range(threads)]\n pool.close()\n pool.join()\n return results\n \ndef init():\n #add the config\n config_path = \"/root/DB-GPT/config/tool_config.yaml\"\n with open(config_path, 
'r') as config_file:\n config = yaml.safe_load(config_file) \n db_args =DBArgs('pgsql', config)\n return db_args\n\n#create a table\ndef create_table(table_name,colsize, ncolumns):\n db=Database(init())\n column_definitions = ', '.join(f'name{i} varchar({colsize})' for i in range(ncolumns))\n creat_sql = f'CREATE TABLE {table_name} (id int, {column_definitions}, time timestamp);'\n db.execute_sqls(creat_sql)\n\n#delete the table\ndef delete_table(table_name):\n db=Database(init())\n delete_sql=f'DROP TABLE if exists {table_name}'\n db.execute_sqls(delete_sql)\n\n#print the current time\ndef print_time():\n current_time = datetime.datetime.now()\n formatted_time = current_time.strftime(\"%Y-%m-%d %H:%M:%S\")\n print(formatted_time)\n\n\ndef insert_large_data(threads,duration,ncolumns,nrows,colsize,table_name='table1'):\n\n print_time()\n #Delete undeleted tables\n delete_table(table_name)\n #create a new table\n create_table(table_name,colsize, ncolumns)\n db=Database(init())\n #insert the data\n #insert_definitions = ', '.join(f'repeat(round(random()*999)::text,{(colsize//3)})' for i in range(ncolumns))\n insert_definitions = ', '.join(f'(SELECT substr(md5(random()::text), 1, {colsize}))' for i in range(ncolumns))\n insert_data=f'insert into {table_name} select generate_series(1,{nrows}),{insert_definitions}, now();'\n db.concurrent_execute_sql(threads,duration,insert_data,commit_interval=1)\n\n #delete the table\n delete_table(table_name)\n \n #print the end time\n print_time()\nif __name__ == \"__main__\":\n # Number of threads to use for concurrent inserts\n num_threads = 173\n \n # Duration for which to run the inserts (in seconds)\n insert_duration = None\n \n # Number of columns in the table\n num_columns = 14\n \n # Number of rows to insert\n num_rows = 77\n \n # Size of each column (in characters)\n column_size = 56\n \n # Table name\n table_name = 'table1'\n \n # Call the insert_large_data function\n insert_large_data(num_threads, insert_duration, num_columns, num_rows, column_size, table_name)\n", + "description": "In an IoT system, a large amount of data generated by 173 sensors needs to be inserted into the database simultaneously. This process will simulate the database exception caused by the insertion of data. The database table contains 14 columns, with each column having a size of 56 characters, and there are 77 rows of data.\n", + "desc": "In an e-commerce scenario, there is a database named 'SalesDB' that stores sales data for an online store. This database contains a key table called 'SalesRecords', which records detailed information about each sale. The table consists of 77 rows of data, with each row representing a sale record and having 14 columns. These columns may include sales ID, customer ID, product ID, quantity, price, order date, shipping address, payment method, and more. Suppose, at a specific time, 173 users simultaneously perform sales operations, such as adding new sales records or updating existing ones, on the 'SalesRecords' table. Due to the large number of concurrent write requests, the database might experience performance issues. This could be caused by factors such as insufficient buffering mechanisms, lack of optimized indexing, or ineffective transaction management. As a result, the database might struggle to handle these numerous writes, leading to increased write latency and potential anomalies in the sales records. 
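While insert_large_data() runs with 173 concurrent writers, the backlog shows up as a spike of active backends; a minimal sketch using the standard pg_stat_activity view, with placeholder credentials:

import psycopg2

conn = psycopg2.connect(database="testdb", user="xxxx", password="xxxx",
                        host="xxxx", port=5432)  # placeholder credentials
cur = conn.cursor()
cur.execute(
    "SELECT state, count(*) FROM pg_stat_activity "
    "WHERE datname = current_database() GROUP BY state;"
)
# A large count of 'active' (or 'idle in transaction') sessions during the burst is the
# symptom of the concurrent-insert pressure described in this entry.
print(cur.fetchall())
cur.close()
conn.close()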
These anomalies could manifest as incorrect or missing data in the database, affecting daily sales analysis, revenue tracking, and overall business operations.\n" + }, + "105": { + "start_time": "1697312200", + "end_time": "1697312272", + "start_timestamp": "2023-10-15 03:36:40", + "end_timestamp": "2023-10-15 03:37:52", + "alerts": [], + "labels": [ + "highly concurrent commits or highly concurrent inserts" + ], + "command": "python anomaly_trigger/main.py --anomaly INSERT_LARGE_DATA", + "script": "import psycopg2\nimport sys\nsys.path.append('/root/DB-GPT/')\nimport time\nimport datetime\nimport random\nimport yaml\nfrom multiprocessing.pool import *\n\nclass DBArgs(object):\n\n def __init__(self, dbtype, config, dbname=None):\n self.dbtype = dbtype\n if self.dbtype == 'mysql':\n self.host = config['host']\n self.port = config['port']\n self.user = config['user']\n self.password = config['password']\n self.dbname = dbname if dbname else config['dbname']\n self.driver = 'com.mysql.jdbc.Driver'\n self.jdbc = 'jdbc:mysql://'\n else:\n self.host = config['host']\n self.port = config['port']\n self.user = config['user']\n self.password = config['password']\n self.dbname = dbname if dbname else config['dbname']\n self.driver = 'org.postgresql.Driver'\n self.jdbc = 'jdbc:postgresql://'\n\nclass Database():\n def __init__(self, args, timeout=-1):\n self.args = args\n self.conn = self.resetConn(timeout)\n\n\n # self.schema = self.compute_table_schema()\n\n def resetConn(self, timeout=-1):\n conn = psycopg2.connect(database=self.args.dbname,\n user=self.args.user,\n password=self.args.password,\n host=self.args.host,\n port=self.args.port)\n return conn\n\n def execute_sqls(self,sql):\n self.conn =self.resetConn(timeout=-1)\n cur = self.conn.cursor()\n cur.execute(sql)\n self.conn.commit()\n cur.close()\n self.conn.close()\n\n def execute_sql_duration(self, duration, sql, max_id=0, commit_interval=500):\n self.conn = self.resetConn(timeout=-1)\n cursor = self.conn.cursor()\n start = time.time()\n cnt = 0\n if duration > 0:\n while (time.time() - start) < duration:\n if max_id > 0:\n id = random.randint(1, max_id - 1)\n cursor.execute(sql + str(id) + ';')\n else:\n cursor.execute(sql)\n cnt += 1\n if cnt % commit_interval == 0:\n self.conn.commit()\n else:\n print(\"error, the duration should be larger than 0\")\n self.conn.commit()\n cursor.close()\n self.conn.close()\n return cnt\n\n def concurrent_execute_sql(self, threads, duration, sql, max_id=0, commit_interval=500):\n pool = ThreadPool(threads)\n results = [pool.apply_async(self.execute_sql_duration, (duration, sql, max_id, commit_interval)) for _ in range(threads)]\n pool.close()\n pool.join()\n return results\n \ndef init():\n #add the config\n config_path = \"/root/DB-GPT/config/tool_config.yaml\"\n with open(config_path, 'r') as config_file:\n config = yaml.safe_load(config_file) \n db_args =DBArgs('pgsql', config)\n return db_args\n\n#create a table\ndef create_table(table_name,colsize, ncolumns):\n db=Database(init())\n column_definitions = ', '.join(f'name{i} varchar({colsize})' for i in range(ncolumns))\n creat_sql = f'CREATE TABLE {table_name} (id int, {column_definitions}, time timestamp);'\n db.execute_sqls(creat_sql)\n\n#delete the table\ndef delete_table(table_name):\n db=Database(init())\n delete_sql=f'DROP TABLE if exists {table_name}'\n db.execute_sqls(delete_sql)\n\n#print the current time\ndef print_time():\n current_time = datetime.datetime.now()\n formatted_time = current_time.strftime(\"%Y-%m-%d %H:%M:%S\")\n 
print(formatted_time)\n\n\ndef insert_large_data(threads,duration,ncolumns,nrows,colsize,table_name='table1'):\n\n print_time()\n #Delete undeleted tables\n delete_table(table_name)\n #create a new table\n create_table(table_name,colsize, ncolumns)\n db=Database(init())\n #insert the data\n #insert_definitions = ', '.join(f'repeat(round(random()*999)::text,{(colsize//3)})' for i in range(ncolumns))\n insert_definitions = ', '.join(f'(SELECT substr(md5(random()::text), 1, {colsize}))' for i in range(ncolumns))\n insert_data=f'insert into {table_name} select generate_series(1,{nrows}),{insert_definitions}, now();'\n db.concurrent_execute_sql(threads,duration,insert_data,commit_interval=1)\n\n #delete the table\n delete_table(table_name)\n \n #print the end time\n print_time()\nif __name__ == \"__main__\":\n # Number of threads to use for concurrent inserts\n num_threads = 173\n \n # Duration for which to run the inserts (in seconds)\n insert_duration = None\n \n # Number of columns in the table\n num_columns = 36\n \n # Number of rows to insert\n num_rows = 95\n \n # Size of each column (in characters)\n column_size = 80\n \n # Table name\n table_name = 'table1'\n \n # Call the insert_large_data function\n insert_large_data(num_threads, insert_duration, num_columns, num_rows, column_size, table_name)\n", + "description": "In an Internet of Things (IoT) application, there are 173 devices generating a large amount of data that needs to be inserted into a database simultaneously. Each device produces data with 36 columns, each with a size of 80 characters, and there are a total of 95 rows of data. This simulation aims to test the database's performance and potential exceptions that may occur during this process.\n", + "desc": "In an e-commerce platform scenario, there is a database called 'ProductDB' used for storing product information. Within this database, there is a table named 'ProductCatalog' that contains detailed information about various products. This table consists of 95 rows, with each row representing a product entry. Each row has 36 columns, including product ID, name, description, price, stock quantity, brand, category, size, color, weight, production date, expiration date, supplier information, country, rating, number of reviews, and other relevant attributes. The size of each column can hold up to 80 characters.In this particular scenario, the script is triggering the INSERT_LARGE_DATA anomaly, which simulates a situation where a large volume of data is being inserted into the ProductCatalog table. The script is running with 173 threads, implying that there are 173 concurrent insert operations happening at the same time.This scenario aims to test the performance and scalability of the database in handling a high number of concurrent insert operations. It helps simulate real-world scenarios where there is a surge in product additions or updates in an e-commerce platform, such as during a new product launch or a seasonal sale. 
The script's execution triggers the anomaly by inserting a large amount of data, which can put stress on the database's resources and potentially lead to performance degradation or anomalies in the system.\n" + }, + "106": { + "start_time": "1697312332", + "end_time": "1697312392", + "start_timestamp": "2023-10-15 03:38:52", + "end_timestamp": "2023-10-15 03:39:52", + "alerts": [], + "labels": [ + "highly concurrent updates" + ], + "command": "python anomaly_trigger/main.py --anomaly LOCK_CONTENTION", + "script": "import psycopg2\nimport sys\nsys.path.append('/root/DB-GPT/')\nimport time\nimport datetime\nimport random\nimport yaml\nfrom multiprocessing.pool import *\n\n\nclass DBArgs(object):\n\n def __init__(self, dbtype, config, dbname=None):\n self.dbtype = dbtype\n if self.dbtype == 'mysql':\n self.host = config['host']\n self.port = config['port']\n self.user = config['user']\n self.password = config['password']\n self.dbname = dbname if dbname else config['dbname']\n self.driver = 'com.mysql.jdbc.Driver'\n self.jdbc = 'jdbc:mysql://'\n else:\n self.host = config['host']\n self.port = config['port']\n self.user = config['user']\n self.password = config['password']\n self.dbname = dbname if dbname else config['dbname']\n self.driver = 'org.postgresql.Driver'\n self.jdbc = 'jdbc:postgresql://'\n\nclass Database():\n def __init__(self, args, timeout=-1):\n self.args = args\n self.conn = self.resetConn(timeout)\n\n\n # self.schema = self.compute_table_schema()\n\n def resetConn(self, timeout=-1):\n conn = psycopg2.connect(database=self.args.dbname,\n user=self.args.user,\n password=self.args.password,\n host=self.args.host,\n port=self.args.port)\n return conn\n \n def execute_sqls(self,sql):\n self.conn =self.resetConn(timeout=-1)\n cur = self.conn.cursor()\n cur.execute(sql)\n self.conn.commit()\n cur.close()\n self.conn.close()\n\n def execute_sql_duration(self, duration, sql, max_id=0, commit_interval=500):\n self.conn = self.resetConn(timeout=-1)\n cursor = self.conn.cursor()\n start = time.time()\n cnt = 0\n if duration > 0:\n while (time.time() - start) < duration:\n if max_id > 0:\n id = random.randint(1, max_id - 1)\n cursor.execute(sql + str(id) + ';')\n else:\n cursor.execute(sql)\n cnt += 1\n if cnt % commit_interval == 0:\n self.conn.commit()\n else:\n print(\"error, the duration should be larger than 0\")\n self.conn.commit()\n cursor.close()\n self.conn.close()\n return cnt\n\n def concurrent_execute_sql(self, threads, duration, sql, max_id=0, commit_interval=500):\n pool = ThreadPool(threads)\n results = [pool.apply_async(self.execute_sql_duration, (duration, sql, max_id, commit_interval)) for _ in range(threads)]\n pool.close()\n pool.join()\n return results\n\ndef init():\n #add the config\n config_path = \"/root/DB-GPT/config/tool_config.yaml\"\n with open(config_path, 'r') as config_file:\n config = yaml.safe_load(config_file) \n db_args =DBArgs('pgsql', config)\n return db_args\n\n\n#create a table\ndef create_table(table_name,colsize, ncolumns):\n db=Database(init())\n column_definitions = ', '.join(f'name{i} varchar({colsize})' for i in range(ncolumns))\n creat_sql = f'CREATE TABLE {table_name} (id int, {column_definitions}, time timestamp);'\n db.execute_sqls(creat_sql)\n\n#delete the table\ndef delete_table(table_name):\n db=Database(init())\n delete_sql=f'DROP TABLE if exists {table_name}'\n db.execute_sqls(delete_sql)\n\n#print the current time\ndef print_time():\n current_time = datetime.datetime.now()\n formatted_time = current_time.strftime(\"%Y-%m-%d 
%H:%M:%S\")\n print(formatted_time)\n\n\ndef lock_contention(threads,duration,ncolumns,nrows,colsize,table_name='table1'):\n #create a new table\n print_time()\n delete_table(table_name)\n create_table(table_name,colsize, ncolumns)\n db=Database(init())\n # insert some data to be updated \n insert_definitions = ', '.join(f'(SELECT substr(md5(random()::text), 1, {colsize}))' for i in range(ncolumns))\n insert_data=f'insert into {table_name} select generate_series(1,{nrows}),{insert_definitions}, now();' \n db.execute_sqls(insert_data) \n pool = Pool(threads)\n for _ in range(threads):\n pool.apply_async(\n lock, (table_name, ncolumns, colsize, duration, nrows))\n pool.close()\n pool.join()\n #delete the table\n delete_table(table_name)\n print_time()\n\ndef lock(table_name, ncolumns, colsize, duration, nrows):\n args=init()\n start = time.time()\n #lock_contention\n while time.time()-start < duration:\n conn = psycopg2.connect(database=args.dbname, user=args.user, password=args.password,\n host=args.host, port=args.port)\n cur = conn.cursor()\n while time.time()-start < duration:\n col_name = random.randint(0, ncolumns-1)\n row_name = random.randint(1, nrows-1)\n lock_contention = f'update {table_name} set name{col_name}=(SELECT substr(md5(random()::text), 1, {colsize})) where id ={row_name}'\n #db.concurrent_execute_sql(threads,duration,lock_contention,nrows)\n cur.execute(lock_contention)\n conn.commit()\n conn.commit()\n conn.close()\n\nif __name__ == \"__main__\":\n # Number of threads to use for concurrent inserts\n num_threads = 71\n \n # Duration for which to run the inserts (in seconds)\n insert_duration = None\n \n # Number of columns in the table\n num_columns = 61\n \n # Number of rows to insert\n num_rows = 352\n \n # Size of each column (in characters)\n column_size = 98\n \n # Table name\n table_name = 'table1'\n \n # Call the insert_large_data function\n lock_contention(num_threads, insert_duration, num_columns, num_rows, column_size, table_name)\n", + "description": "In an online database with 61 columns and 352 rows of data records, each column having a size of 98 characters, simulate a scenario where 71 users simultaneously compete to lock the database table for performing update operations. This may cause a database exception due to contention for locking resources.\n", + "desc": "In a banking scenario, there is a database called 'AccountDB' which stores customer account information such as account number, balance, transaction history, account type, and more. In this database, there is a table named 'TransactionHistory' which records detailed information about all the transactions made by customers. This table contains 352 rows of data, with each row representing a transaction and having a total of 61 columns, each containing information of up to 98 characters. These columns may include transaction ID, account number, transaction type (such as deposit, withdrawal, transfer), transaction amount, transaction date and time, counterparty account information, transaction status, employee ID (of the bank staff who processed the transaction), transaction location, and currency type.In a typical banking scenario, multiple bank staff members, automated systems, or customers may access and update the 'TransactionHistory' table simultaneously. However, if a specific event occurs where 71 users simultaneously attempt to update the same or adjacent rows in the table, it could result in a contention for locking the database table. 
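A minimal sketch of how the contention produced by lock_contention() could be confirmed: blocked sessions appear in the standard pg_stat_activity view with wait_event_type = 'Lock' (PostgreSQL 9.6 or later); credentials are placeholders.

import psycopg2

conn = psycopg2.connect(database="testdb", user="xxxx", password="xxxx",
                        host="xxxx", port=5432)  # placeholder credentials
cur = conn.cursor()
cur.execute(
    "SELECT pid, wait_event_type, wait_event, query "
    "FROM pg_stat_activity WHERE wait_event_type = 'Lock';"
)
for row in cur.fetchall():
    # Each row is a backend waiting for a row lock held by one of the other 70 updaters.
    print(row)
cur.close()
conn.close()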
This contention for locking the database, if it lasts for a significant duration, could lead to delayed processing or failure of other users' transaction requests. This contention for locking could potentially affect the daily banking operations and result in a negative impact on customer experience.\n" + }, + "107": { + "start_time": "1697312452", + "end_time": "1697312518", + "start_timestamp": "2023-10-15 03:40:52", + "end_timestamp": "2023-10-15 03:41:58", + "alerts": [], + "labels": [ + "highly deletes" + ], + "command": "python anomaly_trigger/main.py --anomaly VACUUM", + "script": "import psycopg2\nimport sys\nsys.path.append('/root/DB-GPT/')\nimport time\nimport datetime\nimport random\nimport yaml\nfrom multiprocessing.pool import *\n\n\nclass DBArgs(object):\n\n def __init__(self, dbtype, config, dbname=None):\n self.dbtype = dbtype\n if self.dbtype == 'mysql':\n self.host = config['host']\n self.port = config['port']\n self.user = config['user']\n self.password = config['password']\n self.dbname = dbname if dbname else config['dbname']\n self.driver = 'com.mysql.jdbc.Driver'\n self.jdbc = 'jdbc:mysql://'\n else:\n self.host = config['host']\n self.port = config['port']\n self.user = config['user']\n self.password = config['password']\n self.dbname = dbname if dbname else config['dbname']\n self.driver = 'org.postgresql.Driver'\n self.jdbc = 'jdbc:postgresql://'\n\nclass Database():\n def __init__(self, args, timeout=-1):\n self.args = args\n self.conn = self.resetConn(timeout)\n\n\n # self.schema = self.compute_table_schema()\n\n def resetConn(self, timeout=-1):\n conn = psycopg2.connect(database=self.args.dbname,\n user=self.args.user,\n password=self.args.password,\n host=self.args.host,\n port=self.args.port)\n return conn\n \n def execute_sqls(self,sql):\n self.conn =self.resetConn(timeout=-1)\n cur = self.conn.cursor()\n cur.execute(sql)\n self.conn.commit()\n cur.close()\n self.conn.close()\n\n def execute_sql_duration(self, duration, sql, max_id=0, commit_interval=500):\n self.conn = self.resetConn(timeout=-1)\n cursor = self.conn.cursor()\n start = time.time()\n cnt = 0\n if duration > 0:\n while (time.time() - start) < duration:\n if max_id > 0:\n id = random.randint(1, max_id - 1)\n cursor.execute(sql + str(id) + ';')\n else:\n cursor.execute(sql)\n cnt += 1\n if cnt % commit_interval == 0:\n self.conn.commit()\n else:\n print(\"error, the duration should be larger than 0\")\n self.conn.commit()\n cursor.close()\n self.conn.close()\n return cnt\n\n def concurrent_execute_sql(self, threads, duration, sql, max_id=0, commit_interval=500):\n pool = ThreadPool(threads)\n results = [pool.apply_async(self.execute_sql_duration, (duration, sql, max_id, commit_interval)) for _ in range(threads)]\n pool.close()\n pool.join()\n return results\n \ndef init():\n #add the config\n config_path = \"/root/DB-GPT/config/tool_config.yaml\"\n with open(config_path, 'r') as config_file:\n config = yaml.safe_load(config_file) \n db_args =DBArgs('pgsql', config)\n return db_args\n\n\n#create a table\ndef create_table(table_name,colsize, ncolumns):\n db=Database(init())\n column_definitions = ', '.join(f'name{i} varchar({colsize})' for i in range(ncolumns))\n creat_sql = f'CREATE TABLE {table_name} (id int, {column_definitions}, time timestamp);'\n db.execute_sqls(creat_sql)\n\n#delete the table\ndef delete_table(table_name):\n db=Database(init())\n delete_sql=f'DROP TABLE if exists {table_name}'\n db.execute_sqls(delete_sql)\n\n#print the current time\ndef print_time():\n current_time = 
datetime.datetime.now()\n formatted_time = current_time.strftime(\"%Y-%m-%d %H:%M:%S\")\n print(formatted_time)\n\n\ndef vacuum(threads,duration,ncolumns,nrows,colsize,table_name='table1'):\n db=Database(init())\n #create a new table\n print_time()\n delete_table(table_name)\n create_table(table_name,colsize, ncolumns)\n\n # insert some data to be deleted\n insert_definitions = ', '.join(f'(SELECT substr(md5(random()::text), 1, {colsize}))' for i in range(ncolumns))\n insert_data=f'insert into {table_name} select generate_series(1,{nrows}),{insert_definitions}, now();' \n db.execute_sqls(insert_data) \n\n # delete 80% of the rows\n delete_nrows=int(nrows*0.8)\n vacuum=f'delete from {table_name} where id < {delete_nrows};'\n db.execute_sqls(vacuum)\n\n # do the select , then the vacuum occurs\n select='select * from '+table_name+' where id='\n db.concurrent_execute_sql(threads,duration,select,nrows)\n\n #delete the table\n delete_table(table_name)\n print_time()\n\n\nif __name__ == \"__main__\":\n # Number of threads to use for concurrent inserts\n num_threads = 151\n \n # Duration for which to run the inserts (in seconds)\n insert_duration = None\n \n # Number of columns in the table\n num_columns = 14\n \n # Number of rows to insert\n num_rows = 2355912\n \n # Size of each column (in characters)\n column_size = 80\n \n # Table name\n table_name = 'table1'\n \n # Call the insert_large_data function\n vacuum(num_threads, insert_duration, num_columns, num_rows, column_size, table_name)\n", + "description": "In a busy online marketplace, 151 users are simultaneously searching the database table containing 14 columns and 2,355,912 rows of product records. Each column has a size of 80 characters. This scenario simulates the occurrence of an exception due to the lack of a necessary index after a vacuum operation, which is a large-scale data cleaning process.\n", + "desc": "In the context of an e-commerce platform, there is a database named 'OnlineStoreDB' which stores detailed information about various products. One of the key tables in this database is 'ProductDetails', which contains data about products such as product ID, name, price, stock quantity, description, brand, category, size, color, weight, production date, expiration date, supplier information, country, rating, number of reviews, sales status, promotional information, image link, and other related attributes. Suppose at a certain moment, the database administrator needs to perform a vacuum operation on this table. The vacuum operation is a maintenance task that reclaims storage space occupied by deleted or updated rows, improves database performance, and optimizes disk usage. In this specific scenario, the vacuum task needs to be performed on the 'ProductDetails' table with 2,355,912 rows of data. To expedite this task, the administrator decides to use 151 threads to perform the vacuum operation. By utilizing multiple threads, the vacuum operation can be paralleled and executed concurrently, leading to faster completion. Additionally, each row in the table contains 14 columns, with each column capable of storing up to 80 characters.During the vacuum operation, the database will go through the 'ProductDetails' table and identify unused space, mark it as available for reuse, and compact the table data to improve storage efficiency. This process involves scanning and updating a large amount of data, which can have a significant impact on database performance. 
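If a manual or automatic vacuum is in fact running on table1 while the 151 reader threads hammer it, its progress can be watched through pg_stat_progress_vacuum (PostgreSQL 9.6 or later); a minimal sketch with placeholder credentials:

import psycopg2

conn = psycopg2.connect(database="testdb", user="xxxx", password="xxxx",
                        host="xxxx", port=5432)  # placeholder credentials
cur = conn.cursor()
cur.execute(
    "SELECT relid::regclass AS table_name, phase, heap_blks_total, heap_blks_scanned "
    "FROM pg_stat_progress_vacuum;"
)
print(cur.fetchall())  # an empty result simply means no vacuum is running at that moment
cur.close()
conn.close()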
If the database server does not have sufficient resources or if the operation is not properly optimized, it could potentially result in anomalies such as increased response times, resource contention, or even database failures.\n" + }, + "108": { + "start_time": "1697312578", + "end_time": "1697312693", + "start_timestamp": "2023-10-15 03:42:58", + "end_timestamp": "2023-10-15 03:44:53", + "alerts": [], + "labels": [ + "too many indexes" + ], + "command": "python anomaly_trigger/main.py --anomaly REDUNDANT_INDEX", + "script": "import psycopg2\nimport sys\nsys.path.append('/root/DB-GPT/')\nimport time\nimport datetime\nimport random\nimport yaml\nfrom multiprocessing.pool import *\n\n\nclass DBArgs(object):\n\n def __init__(self, dbtype, config, dbname=None):\n self.dbtype = dbtype\n if self.dbtype == 'mysql':\n self.host = config['host']\n self.port = config['port']\n self.user = config['user']\n self.password = config['password']\n self.dbname = dbname if dbname else config['dbname']\n self.driver = 'com.mysql.jdbc.Driver'\n self.jdbc = 'jdbc:mysql://'\n else:\n self.host = config['host']\n self.port = config['port']\n self.user = config['user']\n self.password = config['password']\n self.dbname = dbname if dbname else config['dbname']\n self.driver = 'org.postgresql.Driver'\n self.jdbc = 'jdbc:postgresql://'\n\nclass Database():\n def __init__(self, args, timeout=-1):\n self.args = args\n self.conn = self.resetConn(timeout)\n\n\n # self.schema = self.compute_table_schema()\n\n def resetConn(self, timeout=-1):\n conn = psycopg2.connect(database=self.args.dbname,\n user=self.args.user,\n password=self.args.password,\n host=self.args.host,\n port=self.args.port)\n return conn\n \n def execute_sqls(self,sql):\n self.conn =self.resetConn(timeout=-1)\n cur = self.conn.cursor()\n cur.execute(sql)\n self.conn.commit()\n cur.close()\n self.conn.close()\n\n def execute_sql_duration(self, duration, sql, max_id=0, commit_interval=500):\n self.conn = self.resetConn(timeout=-1)\n cursor = self.conn.cursor()\n start = time.time()\n cnt = 0\n if duration > 0:\n while (time.time() - start) < duration:\n if max_id > 0:\n id = random.randint(1, max_id - 1)\n cursor.execute(sql + str(id) + ';')\n else:\n cursor.execute(sql)\n cnt += 1\n if cnt % commit_interval == 0:\n self.conn.commit()\n else:\n print(\"error, the duration should be larger than 0\")\n self.conn.commit()\n cursor.close()\n self.conn.close()\n return cnt\n\n def concurrent_execute_sql(self, threads, duration, sql, max_id=0, commit_interval=500):\n pool = ThreadPool(threads)\n results = [pool.apply_async(self.execute_sql_duration, (duration, sql, max_id, commit_interval)) for _ in range(threads)]\n pool.close()\n pool.join()\n return results\n \n def build_index(self, table_name, idx_num):\n self.conn = self.resetConn(timeout=-1)\n cursor = self.conn.cursor()\n \n for i in range(0, idx_num):\n the_sql = 'CREATE INDEX index_' + table_name + '_' + str(i) + ' ON ' + table_name + '(name' + str(i) + ');'\n print(the_sql)\n cursor.execute(the_sql)\n\n \n self.conn.commit()\n self.conn.close()\n return\n\n\n \n def drop_index(self,table_name):\n self.conn = self.resetConn(timeout=-1)\n cursor = self.conn.cursor()\n cursor.execute(\"select indexname from pg_indexes where tablename='\"+table_name+\"';\")\n idxs = cursor.fetchall()\n for idx in idxs:\n the_sql = 'DROP INDEX ' + idx[0] + ';'\n cursor.execute(the_sql)\n print(the_sql)\n self.conn.commit()\n self.conn.close()\n return\n\n\ndef init():\n #add the config\n config_path = 
\"/root/DB-GPT/config/tool_config.yaml\"\n with open(config_path, 'r') as config_file:\n config = yaml.safe_load(config_file) \n db_args =DBArgs('pgsql', config)\n return db_args\n\n\n#create a table\ndef create_table(table_name,colsize, ncolumns):\n db=Database(init())\n column_definitions = ', '.join(f'name{i} varchar({colsize})' for i in range(ncolumns))\n creat_sql = f'CREATE TABLE {table_name} (id int, {column_definitions}, time timestamp);'\n db.execute_sqls(creat_sql)\n\n#delete the table\ndef delete_table(table_name):\n db=Database(init())\n delete_sql=f'DROP TABLE if exists {table_name}'\n db.execute_sqls(delete_sql)\n\n#print the current time\ndef print_time():\n current_time = datetime.datetime.now()\n formatted_time = current_time.strftime(\"%Y-%m-%d %H:%M:%S\")\n print(formatted_time)\n\ndef redundent_index(threads,duration,ncolumns,nrows,colsize,nindex,table_name='table1'):\n #create a new table\n print_time()\n delete_table(table_name)\n create_table(table_name,colsize, ncolumns)\n db=Database(init())\n # insert some data to be updated \n insert_definitions = ', '.join(f'(SELECT substr(md5(random()::text), 1, {colsize}))' for i in range(ncolumns))\n insert_data=f'insert into {table_name} select generate_series(1,{nrows}),{insert_definitions}, now();' \n db.execute_sqls(insert_data) \n\n #initialization of the indexes\n nindex=int((nindex*ncolumns)/10)\n db.build_index(table_name,nindex)\n id_index='CREATE INDEX index_'+table_name+'_id ON '+table_name+'(id);'\n db.execute_sqls(id_index)\n\n #lock_contention\n pool = Pool(threads)\n for _ in range(threads):\n pool.apply_async(\n lock, (table_name, ncolumns, colsize, duration, nrows))\n pool.close()\n pool.join()\n\n #drop the index\n db.drop_index(table_name)\n\n #delete the table\n delete_table(table_name)\n print_time()\n\n\ndef lock(table_name, ncolumns, colsize, duration, nrows):\n args=init()\n start = time.time()\n #lock_contention\n while time.time()-start < duration:\n conn = psycopg2.connect(database=args.dbname, user=args.user, password=args.password,\n host=args.host, port=args.port)\n cur = conn.cursor()\n while time.time()-start < duration:\n col_name = random.randint(0, ncolumns-1)\n row_name = random.randint(1, nrows-1)\n lock_contention = f'update {table_name} set name{col_name}=(SELECT substr(md5(random()::text), 1, {colsize})) where id ={row_name}'\n #db.concurrent_execute_sql(threads,duration,lock_contention,nrows)\n cur.execute(lock_contention)\n conn.commit()\n conn.commit()\n conn.close()\n\nif __name__ == \"__main__\":\n # Number of threads to use for concurrent inserts\n num_threads = 8\n \n # Duration for which to run the inserts (in seconds)\n insert_duration = None\n \n # Number of columns in the table\n num_columns = 86\n \n # Number of rows to insert\n num_rows = 445743\n \n # Size of each column (in characters)\n column_size = 54\n \n # Table name\n table_name = 'table1'\n \n nindex=6\n \n # Call the insert_large_data function\n redundent_index(num_threads, insert_duration, num_columns, num_rows, column_size, nindex,table_name)\n", + "description": "In a financial database with 86 columns and 445,743 rows, each with a column size of 54 characters, a large number of indexes are created for various financial metrics at the beginning of the query. 
Eight users then perform queries on the database, causing additional storage and performance overhead due to the redundant indexes.\n", + "desc": "In an e-commerce scenario, imagine a database called 'OnlineStoreDB', which stores various information about products for an online store. This database contains a key table named 'ProductDetails' that records detailed information about each product. The table consists of 445,743 rows of data, with each row representing a specific product entry. It has a total of 86 columns, with each column capable of storing up to 54 characters of information. These columns include attributes such as product ID, name, price, stock quantity, description, brand, category, size, color, weight, production date, expiration date, supplier information, country, rating, number of reviews, sales status, promotional information, image link, and other related information. In this particular situation, the database administrator has created multiple redundant indexes on the 'ProductDetails' table. These redundant indexes are unnecessary and might have been created for future query acceleration or other purposes. While they can provide some benefits for specific queries, they consume additional storage space and increase the overhead for data modification operations. This can result in slower query performance, increased disk I/O operations, and additional maintenance overhead for the database. In the context of this scenario, the presence of redundant indexes might slow down the overall performance of the database, impacting the efficiency of various operations such as inserting, updating, deleting, or querying product data.\n" + }, + "109": { + "start_time": "1697312753", + "end_time": "1697312844", + "start_timestamp": "2023-10-15 03:45:53", + "end_timestamp": "2023-10-15 03:47:24", + "alerts": [], + "labels": [ + "INSERT_LARGE_DATA", + "IO_CONTENTION" + ], + "command": "python anomaly_trigger/main.py --anomaly INSERT_LARGE_DATA,IO_CONTENTION", + "script": "import os\nimport datetime\n\n#print the current time\ndef print_time():\n current_time = datetime.datetime.now()\n formatted_time = current_time.strftime(\"%Y-%m-%d %H:%M:%S\")\n print(formatted_time)\n\nif __name__ == \"__main__\":\n print_time()\n command = (\n \"su - root -c 'cd /sysbench-tpcc-master; \"\n \"./tpcc.lua --db-driver=pgsql --tables=2 --scale=3 --threads=50 --events=0 \"\n \"--pgsql-host=xxxx --pgsql-user=xxxx --pgsql-password=xxxx \"\n \"--pgsql-port=5432 --pgsql-db=tpcc --time=90 --rand-type=uniform --report-interval=10 run'\"\n )\n\n os.system(command)\n print_time()\n", + "description": "This script simulates a scenario in a file sharing system where multiple users are uploading, downloading, or editing files at the same time. As a result, there is I/O contention, which causes a slowdown in file transfers.\n", + "desc": "In an e-commerce platform, there is a database specifically used for an online store. This database contains a table called 'ProductRecords' which holds detailed information about various products. The table consists of 200,000 rows of data, with each row representing a different product. There are 20 columns in the table, each containing information up to 100 characters. These columns include product ID, name, price, stock quantity, description, brand, category, size, color, weight, production date, expiration date, supplier information, country, rating, number of reviews, sales status, promotional information, image link, and other related attributes. 
In this particular scenario, there is a need to insert a large amount of data into the database while multiple users are simultaneously accessing and using the system. This can cause I/O contention, where the storage and network bandwidth of the system become strained due to the high concurrency of file operations. As a result, the file transfer speeds may slow down, impacting the overall performance of the system. Additionally, frequent write operations in the database might cause locking and transaction management issues, further slowing down file processing and metadata recording.In the database of an e-commerce platform, there is a database specifically used for collecting and analyzing sales data, named 'SalesDataDB'. This database contains multiple tables, including a table named 'OrderDetails', which records detailed information about customer orders. Another table named 'ProductInventory' stores the inventory information of various products. The 'OrderDetails' table consists of thousands of rows of data, each representing an individual order, with columns such as order ID, customer ID, product ID, quantity, price, and order date. The 'ProductInventory' table contains inventory data for thousands of products, including product ID, current stock level, supplier information, and location. \n" + }, + "110": { + "start_time": "1697312904", + "end_time": "1697312964", + "start_timestamp": "2023-10-15 03:48:24", + "end_timestamp": "2023-10-15 03:49:24", + "alerts": [], + "labels": [ + "POOR JOIN PERFORMANCE", + "CPU CONTENTION" + ], + "command": "python anomaly_trigger/main.py --anomaly POOR_JOIN_PERFORMANCE,CPU_CONTENTION", + "script": "import os\nimport re\nimport time\n\nimport psycopg2\n\n\nREPEATCOUNT = 1\nTIMELOGPATH = str(int(time.time())) + \"_job_trigger_time_log.txt\"\nTIMELOG = open(TIMELOGPATH, 'w+')\n\n\nclass Database():\n\n def __init__(self):\n self.conn = None\n self.conn = psycopg2.connect(database='imdbload',\n user='xxxx',\n password='xxxx',\n host='xxxx',\n port=5432)\n\n def execute_sql(self, sql):\n fail = 1\n cur = self.conn.cursor()\n i = 0\n cnt = 3\n while fail == 1 and i < cnt:\n try:\n fail = 0\n cur.execute(sql)\n except BaseException as error:\n fail = 1\n print(error)\n res = []\n if fail == 0:\n res = cur.fetchall()\n i = i + 1\n if fail == 1:\n # print(\"SQL Execution Fatal!!\", sql)\n return 0, ''\n elif fail == 0:\n return 1, res\n\n\ndef all_sql_files():\n res_path = \"{}/join-order-benchmark-master/\".format(\n os.path.dirname(os.path.abspath(__file__)))\n # all_file_list = list(filter(file_filter, os.listdir(res_path)))\n # all_file_list = sorted(all_file_list, key=custom_sort)\n all_file_list = [\n '1a.sql', '1b.sql', '1c.sql', '1d.sql',\n '2a.sql', '2b.sql', '2c.sql', '2d.sql',\n '3a.sql', '3b.sql', '3c.sql',\n '4a.sql', '4b.sql', '4c.sql',\n '5a.sql', '5b.sql', '5c.sql',\n '6a.sql', '6b.sql', '6c.sql', '6d.sql', '6e.sql', '6f.sql',\n '7a.sql', '7b.sql', '7c.sql',\n '8a.sql', '8b.sql', '8c.sql', '8d.sql',\n '9a.sql', '9b.sql', '9c.sql', '9d.sql',\n '10a.sql', '10b.sql', '10c.sql',\n '11a.sql', '11b.sql', '11c.sql', '11d.sql',\n '12a.sql', '12b.sql', '12c.sql',\n '13a.sql', '13b.sql', '13c.sql', '13d.sql',\n '14a.sql', '14b.sql', '14c.sql',\n '15a.sql', '15b.sql', '15c.sql', '15d.sql',\n '16a.sql', '16b.sql', '16c.sql', '16d.sql',\n '17a.sql', '17b.sql', '17c.sql', '17d.sql', '17e.sql', '17f.sql',\n '18a.sql', '18b.sql', '18c.sql',\n '19a.sql', '19b.sql', '19c.sql', '19d.sql',\n '20a.sql', '20b.sql', '20c.sql',\n '21a.sql', '21b.sql', '21c.sql',\n 
'22a.sql', '22b.sql', '22c.sql', '22d.sql',\n '23a.sql', '23b.sql', '23c.sql',\n '24a.sql', '24b.sql',\n '25a.sql', '25b.sql', '25c.sql',\n '26a.sql', '26b.sql', '26c.sql',\n '27a.sql', '27b.sql', '27c.sql',\n '28a.sql', '28b.sql', '28c.sql',\n '29a.sql', '29b.sql', '29c.sql',\n '30a.sql', '30b.sql', '30c.sql',\n '31a.sql', '31b.sql', '31c.sql',\n '32a.sql', '32b.sql',\n '33a.sql', '33b.sql', '33c.sql']\n\n print(all_file_list)\n files_list = []\n for file in all_file_list:\n files_list.append(res_path + file)\n return files_list\n\n\ndef custom_sort(item):\n # \u63d0\u53d6\u6570\u5b57\u548c\u5b57\u6bcd\u90e8\u5206\n match = re.match(r'(\\d+)(\\D+)', item)\n # \u5c06\u6570\u5b57\u90e8\u5206\u8f6c\u6362\u4e3a\u6574\u6570\u4ee5\u8fdb\u884c\u6bd4\u8f83\n num_part = int(match.group(1))\n # \u8fd4\u56de\u5143\u7ec4\u4ee5\u6309\u6570\u5b57\u548c\u5b57\u6bcd\u6392\u5e8f\n return (num_part, match.group(2))\n\n\ndef file_filter(f):\n if f[-4:] == '.sql' and 'schema' not in f and 'fkindexes' not in f:\n return True\n else:\n return False\n\n\ndef get_sql_from_file(file_name):\n file = open(file_name)\n lines = file.readlines().copy()\n sql = ''\n for line in lines:\n sql += line\n sql = sql.replace('\n', ' ').replace(' ', ' ').replace(' ', ' ')\n file.close()\n return sql\n\n\ndef test_hint_from_file(sql_file):\n db = Database()\n sql = get_sql_from_file(sql_file)\n success, result_cont = db.execute_sql(sql)\n print(success, result_cont)\n\n\ndef test_all():\n sql_files = all_sql_files()\n\n for sql_file in list(sql_files)[:-10]:\n if sql_file:\n test_hint_from_file(sql_file)\n\n\ndef test_one():\n res_path = \"{}/join-order-benchmark-master/\".format(\n os.path.dirname(os.path.abspath(__file__)))\n test_hint_from_file(res_path + '1a.sql')\n\n\nif __name__ == '__main__':\n for i in range(0, REPEATCOUNT):\n TIMELOG.write(str(int(time.time()))+\";\")\n test_all()\n TIMELOG.write(str(int(time.time()))+\"\n\")\n TIMELOG.flush()\n\n TIMELOG.close()\n\n)\n", + "description": "In a data analysis system, multiple users are executing a join operation that involves multiple tables with poor join performance. At the same time, there is a high level of competition for CPU resources, which further degrades the performance of the join operation.\n", + "desc": "In the e-commerce platform, frequent product inventory queries or order analysis might require joining the 'OrderDetails' and 'ProductInventory' tables based on the product ID. However, due to poor join performance, the database might experience challenges in efficiently executing join operations on these large tables. This poor performance could be caused by factors such as inefficient indexing, lack of appropriate query optimization, or high CPU contention. These factors can lead to increased CPU usage and contention among concurrent queries, resulting in slower join execution times and overall degradation in database performance. As a consequence, the system might experience delays in generating reports or responding to customer queries related to order details and inventory information. 
Moreover, this poor join performance can directly impact the user experience on the e-commerce platform, leading to potential customer dissatisfaction and loss of business.\n" + }, + "111": { + "start_time": "1697313024", + "end_time": "1697313174", + "start_timestamp": "2023-10-15 03:50:24", + "end_timestamp": "2023-10-15 03:52:54", + "alerts": [], + "labels": [ + "FETCH_LARGE_DATA", + "CORRELATED SUBQUERY" + ], + "command": "python anomaly_trigger/main.py --anomaly FETCH_LARGE_DATA,CORRELATED_SUBQUERY", + "script": "import os\nimport re\nimport time\n\nimport psycopg2\n\n\nREPEATCOUNT = 1\nTIMELOGPATH = str(int(time.time())) + \"_tpch_trigger_time_log.txt\"\nTIMELOG = open(TIMELOGPATH, 'w+')\n\n\nclass Database():\n\n def __init__(self):\n self.conn = None\n self.conn = psycopg2.connect(database='tpch',\n user='xxxx',\n password='xxxx',\n host='xxxx',\n port=5432)\n\n def execute_sql(self, sql):\n fail = 1\n cur = self.conn.cursor()\n i = 0\n cnt = 3\n while fail == 1 and i < cnt:\n try:\n fail = 0\n cur.execute(sql)\n except BaseException as error:\n fail = 1\n print(error)\n res = []\n if fail == 0:\n res = cur.fetchall()\n i = i + 1\n if fail == 1:\n # print(\"SQL Execution Fatal!!\", sql)\n return 0, ''\n elif fail == 0:\n return 1, res\n\n\ndef all_sql_files():\n res_path = \"{}/tpch-queries/\".format(\n os.path.dirname(os.path.abspath(__file__)))\n # all_file_list = list(filter(file_filter, os.listdir(res_path)))\n # all_file_list = sorted(all_file_list, key=custom_sort)\n all_file_list = [\n '4.explain.sql']\n\n print(all_file_list)\n files_list = []\n for file in all_file_list:\n files_list.append(res_path + file)\n return files_list\n\n\ndef custom_sort(item):\n # \u63d0\u53d6\u6570\u5b57\u548c\u5b57\u6bcd\u90e8\u5206\n match = re.match(r'(\\d+)(\\D+)', item)\n # \u5c06\u6570\u5b57\u90e8\u5206\u8f6c\u6362\u4e3a\u6574\u6570\u4ee5\u8fdb\u884c\u6bd4\u8f83\n num_part = int(match.group(1))\n # \u8fd4\u56de\u5143\u7ec4\u4ee5\u6309\u6570\u5b57\u548c\u5b57\u6bcd\u6392\u5e8f\n return (num_part, match.group(2))\n\n\ndef file_filter(f):\n if f[-4:] == '.sql' and 'schema' not in f and 'fkindexes' not in f:\n return True\n else:\n return False\n\n\ndef get_sql_from_file(file_name):\n file = open(file_name)\n lines = file.readlines().copy()\n sql = ''\n for line in lines:\n sql += line\n sql = sql.replace('\n', ' ').replace(' ', ' ').replace(' ', ' ')\n file.close()\n return sql\n\n\ndef test_hint_from_file(sql_file):\n db = Database()\n sql = get_sql_from_file(sql_file)\n success, result_cont = db.execute_sql(sql)\n print(success, result_cont)\n\n\ndef test_all():\n sql_files = all_sql_files()\n\n for sql_file in list(sql_files):\n if sql_file:\n test_hint_from_file(sql_file)\n\n\ndef test_one():\n res_path = \"{}/tpch-queries/\".format(\n os.path.dirname(os.path.abspath(__file__)))\n test_hint_from_file(res_path + '1.explain.sql')\n\n\nif __name__ == '__main__':\n for i in range(0, REPEATCOUNT):\n TIMELOG.write(str(int(time.time()))+\";\")\n test_all()\n TIMELOG.write(str(int(time.time()))+\"\n\")\n TIMELOG.flush()\n\n TIMELOG.close()\n", + "description": "In an online retail system, when retrieving inventory data for each product, related subqueries are used. However, if these subqueries are not optimized and there is a large number of products, the performance of the inventory query may be negatively affected.\n", + "desc": "In an e-commerce platform's database, there is a table named 'ProductInventory' that stores information about the inventory of various products. 
This table contains data for tens of thousands or even hundreds of thousands of products. For each product, the inventory includes details such as the product ID, current stock level, last update time, supplier ID, and warehouse location. When querying the inventory level of products, particularly when trying to determine the total inventory of all products within a specific category, the database might need to perform correlated subqueries. This involves selecting products of a certain category and then conducting subqueries on the 'ProductInventory' table to obtain the inventory data for those products. However, if the category contains a large number of products, executing individual subqueries for each product can be time-consuming and lead to inefficiency. This is because retrieving inventory information for a large number of products may require reading a significant amount of data from the disk, which can result in I/O bottlenecks.\n" + }, + "112": { + "start_time": "1697313234", + "end_time": "1697313305", + "start_timestamp": "2023-10-15 03:53:54", + "end_timestamp": "2023-10-15 03:55:05", + "alerts": [], + "labels": [ + "highly concurrent commits or highly concurrent inserts" + ], + "command": "python anomaly_trigger/main.py --anomaly INSERT_LARGE_DATA", + "script": "import psycopg2\nimport sys\nsys.path.append('/root/DB-GPT/')\nimport time\nimport datetime\nimport random\nimport yaml\nfrom multiprocessing.pool import *\n\nclass DBArgs(object):\n\n def __init__(self, dbtype, config, dbname=None):\n self.dbtype = dbtype\n if self.dbtype == 'mysql':\n self.host = config['host']\n self.port = config['port']\n self.user = config['user']\n self.password = config['password']\n self.dbname = dbname if dbname else config['dbname']\n self.driver = 'com.mysql.jdbc.Driver'\n self.jdbc = 'jdbc:mysql://'\n else:\n self.host = config['host']\n self.port = config['port']\n self.user = config['user']\n self.password = config['password']\n self.dbname = dbname if dbname else config['dbname']\n self.driver = 'org.postgresql.Driver'\n self.jdbc = 'jdbc:postgresql://'\n\nclass Database():\n def __init__(self, args, timeout=-1):\n self.args = args\n self.conn = self.resetConn(timeout)\n\n\n # self.schema = self.compute_table_schema()\n\n def resetConn(self, timeout=-1):\n conn = psycopg2.connect(database=self.args.dbname,\n user=self.args.user,\n password=self.args.password,\n host=self.args.host,\n port=self.args.port)\n return conn\n\n def execute_sqls(self,sql):\n self.conn =self.resetConn(timeout=-1)\n cur = self.conn.cursor()\n cur.execute(sql)\n self.conn.commit()\n cur.close()\n self.conn.close()\n\n def execute_sql_duration(self, duration, sql, max_id=0, commit_interval=500):\n self.conn = self.resetConn(timeout=-1)\n cursor = self.conn.cursor()\n start = time.time()\n cnt = 0\n if duration > 0:\n while (time.time() - start) < duration:\n if max_id > 0:\n id = random.randint(1, max_id - 1)\n cursor.execute(sql + str(id) + ';')\n else:\n cursor.execute(sql)\n cnt += 1\n if cnt % commit_interval == 0:\n self.conn.commit()\n else:\n print(\"error, the duration should be larger than 0\")\n self.conn.commit()\n cursor.close()\n self.conn.close()\n return cnt\n\n def concurrent_execute_sql(self, threads, duration, sql, max_id=0, commit_interval=500):\n pool = ThreadPool(threads)\n results = [pool.apply_async(self.execute_sql_duration, (duration, sql, max_id, commit_interval)) for _ in range(threads)]\n pool.close()\n pool.join()\n return results\n \ndef init():\n #add the config\n config_path = 
\"/root/DB-GPT/config/tool_config.yaml\"\n with open(config_path, 'r') as config_file:\n config = yaml.safe_load(config_file) \n db_args =DBArgs('pgsql', config)\n return db_args\n\n#create a table\ndef create_table(table_name,colsize, ncolumns):\n db=Database(init())\n column_definitions = ', '.join(f'name{i} varchar({colsize})' for i in range(ncolumns))\n creat_sql = f'CREATE TABLE {table_name} (id int, {column_definitions}, time timestamp);'\n db.execute_sqls(creat_sql)\n\n#delete the table\ndef delete_table(table_name):\n db=Database(init())\n delete_sql=f'DROP TABLE if exists {table_name}'\n db.execute_sqls(delete_sql)\n\n#print the current time\ndef print_time():\n current_time = datetime.datetime.now()\n formatted_time = current_time.strftime(\"%Y-%m-%d %H:%M:%S\")\n print(formatted_time)\n\n\ndef insert_large_data(threads,duration,ncolumns,nrows,colsize,table_name='table1'):\n\n print_time()\n #Delete undeleted tables\n delete_table(table_name)\n #create a new table\n create_table(table_name,colsize, ncolumns)\n db=Database(init())\n #insert the data\n #insert_definitions = ', '.join(f'repeat(round(random()*999)::text,{(colsize//3)})' for i in range(ncolumns))\n insert_definitions = ', '.join(f'(SELECT substr(md5(random()::text), 1, {colsize}))' for i in range(ncolumns))\n insert_data=f'insert into {table_name} select generate_series(1,{nrows}),{insert_definitions}, now();'\n db.concurrent_execute_sql(threads,duration,insert_data,commit_interval=1)\n\n #delete the table\n delete_table(table_name)\n \n #print the end time\n print_time()\nif __name__ == \"__main__\":\n # Number of threads to use for concurrent inserts\n num_threads = 70\n \n # Duration for which to run the inserts (in seconds)\n insert_duration = None\n \n # Number of columns in the table\n num_columns = 10\n \n # Number of rows to insert\n num_rows = 51\n \n # Size of each column (in characters)\n column_size = 70\n \n # Table name\n table_name = 'table1'\n \n # Call the insert_large_data function\n insert_large_data(num_threads, insert_duration, num_columns, num_rows, column_size, table_name)\n", + "description": "In a smart home system, 70 devices are simultaneously sending a large amount of data to be inserted into the database. These devices have 10 different types of data and each data type has a size of 70 characters. The number of data rows to be inserted is 51.\n", + "desc": "In an e-commerce database for an online store, a large-scale data insertion operation is being performed. The database consists of multiple tables, one of which is called 'ProductData', which stores detailed information about various products. The data insertion involves inserting 51 rows of data into the 'ProductData' table. Each row represents a separate product entry and contains 10 columns, each with a size of up to 70 characters. These columns include product ID, name, price, stock quantity, description, brand, category, weight, color, and production date. The insertion operation is being executed by multiple threads, specifically 70 threads. This operation might potentially result in performance issues in the database, such as increased write latency or database locking. These issues can arise due to various factors, such as insufficient buffering mechanisms, improper indexing, or inadequate data partitioning strategies. 
The performance impact could affect other database operations and potentially lead to anomalies in the e-commerce platform's overall functionality.\n" + }, + "113": { + "start_time": "1697313366", + "end_time": "1697313437", + "start_timestamp": "2023-10-15 03:56:06", + "end_timestamp": "2023-10-15 03:57:17", + "alerts": [], + "labels": [ + "highly concurrent commits or highly concurrent inserts" + ], + "command": "python anomaly_trigger/main.py --anomaly INSERT_LARGE_DATA", + "script": "import psycopg2\nimport sys\nsys.path.append('/root/DB-GPT/')\nimport time\nimport datetime\nimport random\nimport yaml\nfrom multiprocessing.pool import *\n\nclass DBArgs(object):\n\n def __init__(self, dbtype, config, dbname=None):\n self.dbtype = dbtype\n if self.dbtype == 'mysql':\n self.host = config['host']\n self.port = config['port']\n self.user = config['user']\n self.password = config['password']\n self.dbname = dbname if dbname else config['dbname']\n self.driver = 'com.mysql.jdbc.Driver'\n self.jdbc = 'jdbc:mysql://'\n else:\n self.host = config['host']\n self.port = config['port']\n self.user = config['user']\n self.password = config['password']\n self.dbname = dbname if dbname else config['dbname']\n self.driver = 'org.postgresql.Driver'\n self.jdbc = 'jdbc:postgresql://'\n\nclass Database():\n def __init__(self, args, timeout=-1):\n self.args = args\n self.conn = self.resetConn(timeout)\n\n\n # self.schema = self.compute_table_schema()\n\n def resetConn(self, timeout=-1):\n conn = psycopg2.connect(database=self.args.dbname,\n user=self.args.user,\n password=self.args.password,\n host=self.args.host,\n port=self.args.port)\n return conn\n\n def execute_sqls(self,sql):\n self.conn =self.resetConn(timeout=-1)\n cur = self.conn.cursor()\n cur.execute(sql)\n self.conn.commit()\n cur.close()\n self.conn.close()\n\n def execute_sql_duration(self, duration, sql, max_id=0, commit_interval=500):\n self.conn = self.resetConn(timeout=-1)\n cursor = self.conn.cursor()\n start = time.time()\n cnt = 0\n if duration > 0:\n while (time.time() - start) < duration:\n if max_id > 0:\n id = random.randint(1, max_id - 1)\n cursor.execute(sql + str(id) + ';')\n else:\n cursor.execute(sql)\n cnt += 1\n if cnt % commit_interval == 0:\n self.conn.commit()\n else:\n print(\"error, the duration should be larger than 0\")\n self.conn.commit()\n cursor.close()\n self.conn.close()\n return cnt\n\n def concurrent_execute_sql(self, threads, duration, sql, max_id=0, commit_interval=500):\n pool = ThreadPool(threads)\n results = [pool.apply_async(self.execute_sql_duration, (duration, sql, max_id, commit_interval)) for _ in range(threads)]\n pool.close()\n pool.join()\n return results\n \ndef init():\n #add the config\n config_path = \"/root/DB-GPT/config/tool_config.yaml\"\n with open(config_path, 'r') as config_file:\n config = yaml.safe_load(config_file) \n db_args =DBArgs('pgsql', config)\n return db_args\n\n#create a table\ndef create_table(table_name,colsize, ncolumns):\n db=Database(init())\n column_definitions = ', '.join(f'name{i} varchar({colsize})' for i in range(ncolumns))\n creat_sql = f'CREATE TABLE {table_name} (id int, {column_definitions}, time timestamp);'\n db.execute_sqls(creat_sql)\n\n#delete the table\ndef delete_table(table_name):\n db=Database(init())\n delete_sql=f'DROP TABLE if exists {table_name}'\n db.execute_sqls(delete_sql)\n\n#print the current time\ndef print_time():\n current_time = datetime.datetime.now()\n formatted_time = current_time.strftime(\"%Y-%m-%d %H:%M:%S\")\n 
print(formatted_time)\n\n\ndef insert_large_data(threads,duration,ncolumns,nrows,colsize,table_name='table1'):\n\n print_time()\n #Delete undeleted tables\n delete_table(table_name)\n #create a new table\n create_table(table_name,colsize, ncolumns)\n db=Database(init())\n #insert the data\n #insert_definitions = ', '.join(f'repeat(round(random()*999)::text,{(colsize//3)})' for i in range(ncolumns))\n insert_definitions = ', '.join(f'(SELECT substr(md5(random()::text), 1, {colsize}))' for i in range(ncolumns))\n insert_data=f'insert into {table_name} select generate_series(1,{nrows}),{insert_definitions}, now();'\n db.concurrent_execute_sql(threads,duration,insert_data,commit_interval=1)\n\n #delete the table\n delete_table(table_name)\n \n #print the end time\n print_time()\nif __name__ == \"__main__\":\n # Number of threads to use for concurrent inserts\n num_threads = 70\n \n # Duration for which to run the inserts (in seconds)\n insert_duration = None\n \n # Number of columns in the table\n num_columns = 25\n \n # Number of rows to insert\n num_rows = 82\n \n # Size of each column (in characters)\n column_size = 63\n \n # Table name\n table_name = 'table1'\n \n # Call the insert_large_data function\n insert_large_data(num_threads, insert_duration, num_columns, num_rows, column_size, table_name)\n", + "description": "In a smart home system, 70 sensors generate a large amount of data simultaneously. This data needs to be inserted into the database, which may lead to a database exception. The data consists of 25 columns, with each column having a size of 63 characters. There are a total of 82 rows in the data.\n", + "desc": "In a business intelligence scenario, there is a database named 'BusinessIntelligenceDB' that is used for storing and analyzing data related to business operations. The database contains a key table called 'DataRecords' which stores various types of data such as sales figures, revenue, expenses, and customer information. This table consists of 82 rows of data, each representing a specific data record, and it has 25 columns, each with a size of 63 characters. The columns in this table include data ID, data type, value, date, department, region, product category, and other relevant information.In this particular scenario, there is an anomaly triggered by the insertion of a large volume of data into the 'DataRecords' table. This anomaly is caused by the lack of efficient indexing or partitioning mechanisms in the database to handle such a large-scale data insertion. As a result, when 70 threads simultaneously attempt to insert data into the table, the database's performance is significantly affected. The database may experience delays in processing the insertion requests, resulting in increased latency and potentially impacting other database operations.In a business intelligence environment, where timely data analysis and reporting are crucial for decision-making, such anomalies can hinder the efficiency of data processing and analysis. 
Therefore, it is essential to optimize the database's indexing and partitioning strategies to handle large-scale data insertions effectively.\n" + }, + "114": { + "start_time": "1697313497", + "end_time": "1697313557", + "start_timestamp": "2023-10-15 03:58:17", + "end_timestamp": "2023-10-15 03:59:17", + "alerts": [], + "labels": [ + "highly concurrent updates" + ], + "command": "python anomaly_trigger/main.py --anomaly LOCK_CONTENTION", + "script": "import psycopg2\nimport sys\nsys.path.append('/root/DB-GPT/')\nimport time\nimport datetime\nimport random\nimport yaml\nfrom multiprocessing.pool import *\n\n\nclass DBArgs(object):\n\n def __init__(self, dbtype, config, dbname=None):\n self.dbtype = dbtype\n if self.dbtype == 'mysql':\n self.host = config['host']\n self.port = config['port']\n self.user = config['user']\n self.password = config['password']\n self.dbname = dbname if dbname else config['dbname']\n self.driver = 'com.mysql.jdbc.Driver'\n self.jdbc = 'jdbc:mysql://'\n else:\n self.host = config['host']\n self.port = config['port']\n self.user = config['user']\n self.password = config['password']\n self.dbname = dbname if dbname else config['dbname']\n self.driver = 'org.postgresql.Driver'\n self.jdbc = 'jdbc:postgresql://'\n\nclass Database():\n def __init__(self, args, timeout=-1):\n self.args = args\n self.conn = self.resetConn(timeout)\n\n\n # self.schema = self.compute_table_schema()\n\n def resetConn(self, timeout=-1):\n conn = psycopg2.connect(database=self.args.dbname,\n user=self.args.user,\n password=self.args.password,\n host=self.args.host,\n port=self.args.port)\n return conn\n \n def execute_sqls(self,sql):\n self.conn =self.resetConn(timeout=-1)\n cur = self.conn.cursor()\n cur.execute(sql)\n self.conn.commit()\n cur.close()\n self.conn.close()\n\n def execute_sql_duration(self, duration, sql, max_id=0, commit_interval=500):\n self.conn = self.resetConn(timeout=-1)\n cursor = self.conn.cursor()\n start = time.time()\n cnt = 0\n if duration > 0:\n while (time.time() - start) < duration:\n if max_id > 0:\n id = random.randint(1, max_id - 1)\n cursor.execute(sql + str(id) + ';')\n else:\n cursor.execute(sql)\n cnt += 1\n if cnt % commit_interval == 0:\n self.conn.commit()\n else:\n print(\"error, the duration should be larger than 0\")\n self.conn.commit()\n cursor.close()\n self.conn.close()\n return cnt\n\n def concurrent_execute_sql(self, threads, duration, sql, max_id=0, commit_interval=500):\n pool = ThreadPool(threads)\n results = [pool.apply_async(self.execute_sql_duration, (duration, sql, max_id, commit_interval)) for _ in range(threads)]\n pool.close()\n pool.join()\n return results\n\ndef init():\n #add the config\n config_path = \"/root/DB-GPT/config/tool_config.yaml\"\n with open(config_path, 'r') as config_file:\n config = yaml.safe_load(config_file) \n db_args =DBArgs('pgsql', config)\n return db_args\n\n\n#create a table\ndef create_table(table_name,colsize, ncolumns):\n db=Database(init())\n column_definitions = ', '.join(f'name{i} varchar({colsize})' for i in range(ncolumns))\n creat_sql = f'CREATE TABLE {table_name} (id int, {column_definitions}, time timestamp);'\n db.execute_sqls(creat_sql)\n\n#delete the table\ndef delete_table(table_name):\n db=Database(init())\n delete_sql=f'DROP TABLE if exists {table_name}'\n db.execute_sqls(delete_sql)\n\n#print the current time\ndef print_time():\n current_time = datetime.datetime.now()\n formatted_time = current_time.strftime(\"%Y-%m-%d %H:%M:%S\")\n print(formatted_time)\n\n\ndef 
lock_contention(threads,duration,ncolumns,nrows,colsize,table_name='table1'):\n #create a new table\n print_time()\n delete_table(table_name)\n create_table(table_name,colsize, ncolumns)\n db=Database(init())\n # insert some data to be updated \n insert_definitions = ', '.join(f'(SELECT substr(md5(random()::text), 1, {colsize}))' for i in range(ncolumns))\n insert_data=f'insert into {table_name} select generate_series(1,{nrows}),{insert_definitions}, now();' \n db.execute_sqls(insert_data) \n pool = Pool(threads)\n for _ in range(threads):\n pool.apply_async(\n lock, (table_name, ncolumns, colsize, duration, nrows))\n pool.close()\n pool.join()\n #delete the table\n delete_table(table_name)\n print_time()\n\ndef lock(table_name, ncolumns, colsize, duration, nrows):\n args=init()\n start = time.time()\n #lock_contention\n while time.time()-start < duration:\n conn = psycopg2.connect(database=args.dbname, user=args.user, password=args.password,\n host=args.host, port=args.port)\n cur = conn.cursor()\n while time.time()-start < duration:\n col_name = random.randint(0, ncolumns-1)\n row_name = random.randint(1, nrows-1)\n lock_contention = f'update {table_name} set name{col_name}=(SELECT substr(md5(random()::text), 1, {colsize})) where id ={row_name}'\n #db.concurrent_execute_sql(threads,duration,lock_contention,nrows)\n cur.execute(lock_contention)\n conn.commit()\n conn.commit()\n conn.close()\n\nif __name__ == \"__main__\":\n # Number of threads to use for concurrent inserts\n num_threads = 70\n \n # Duration for which to run the inserts (in seconds)\n insert_duration = None\n \n # Number of columns in the table\n num_columns = 83\n \n # Number of rows to insert\n num_rows = 228\n \n # Size of each column (in characters)\n column_size = 91\n \n # Table name\n table_name = 'table1'\n \n # Call the insert_large_data function\n lock_contention(num_threads, insert_duration, num_columns, num_rows, column_size, table_name)\n", + "description": "In a database for an online store, 70 users simultaneously attempt to perform frequent update operations on a database table containing 83 columns and 228 rows of product records, each with a column size of 91 characters. These users compete with each other to lock the database table, resulting in contention and potentially causing database exceptions.\n", + "desc": "In the e-commerce scenario, there is a database named 'OnlineStoreDB' for an online store. This database contains a key table called 'ProductRecords' that stores detailed information about products. The table consists of 228 rows, each representing a specific product, with 83 columns. These columns contain information such as product ID, name, price, stock quantity, description, brand, category, size, color, weight, production date, expiration date, supplier information, country, rating, number of reviews, sales status, promotional information, image link, and other attributes. In this particular situation, there are 70 users simultaneously accessing the online store, potentially causing contention for database locks. The high number of concurrent threads and the size of the columns and rows could result in performance issues, such as delayed processing or failure of users' requests. 
This could impact the smooth operation of the online store and lead to anomalies in the database.\n" + }, + "115": { + "start_time": "1697313617", + "end_time": "1697313709", + "start_timestamp": "2023-10-15 04:00:17", + "end_timestamp": "2023-10-15 04:01:49", + "alerts": [], + "labels": [ + "highly deletes" + ], + "command": "python anomaly_trigger/main.py --anomaly VACUUM", + "script": "import psycopg2\nimport sys\nsys.path.append('/root/DB-GPT/')\nimport time\nimport datetime\nimport random\nimport yaml\nfrom multiprocessing.pool import *\n\n\nclass DBArgs(object):\n\n def __init__(self, dbtype, config, dbname=None):\n self.dbtype = dbtype\n if self.dbtype == 'mysql':\n self.host = config['host']\n self.port = config['port']\n self.user = config['user']\n self.password = config['password']\n self.dbname = dbname if dbname else config['dbname']\n self.driver = 'com.mysql.jdbc.Driver'\n self.jdbc = 'jdbc:mysql://'\n else:\n self.host = config['host']\n self.port = config['port']\n self.user = config['user']\n self.password = config['password']\n self.dbname = dbname if dbname else config['dbname']\n self.driver = 'org.postgresql.Driver'\n self.jdbc = 'jdbc:postgresql://'\n\nclass Database():\n def __init__(self, args, timeout=-1):\n self.args = args\n self.conn = self.resetConn(timeout)\n\n\n # self.schema = self.compute_table_schema()\n\n def resetConn(self, timeout=-1):\n conn = psycopg2.connect(database=self.args.dbname,\n user=self.args.user,\n password=self.args.password,\n host=self.args.host,\n port=self.args.port)\n return conn\n \n def execute_sqls(self,sql):\n self.conn =self.resetConn(timeout=-1)\n cur = self.conn.cursor()\n cur.execute(sql)\n self.conn.commit()\n cur.close()\n self.conn.close()\n\n def execute_sql_duration(self, duration, sql, max_id=0, commit_interval=500):\n self.conn = self.resetConn(timeout=-1)\n cursor = self.conn.cursor()\n start = time.time()\n cnt = 0\n if duration > 0:\n while (time.time() - start) < duration:\n if max_id > 0:\n id = random.randint(1, max_id - 1)\n cursor.execute(sql + str(id) + ';')\n else:\n cursor.execute(sql)\n cnt += 1\n if cnt % commit_interval == 0:\n self.conn.commit()\n else:\n print(\"error, the duration should be larger than 0\")\n self.conn.commit()\n cursor.close()\n self.conn.close()\n return cnt\n\n def concurrent_execute_sql(self, threads, duration, sql, max_id=0, commit_interval=500):\n pool = ThreadPool(threads)\n results = [pool.apply_async(self.execute_sql_duration, (duration, sql, max_id, commit_interval)) for _ in range(threads)]\n pool.close()\n pool.join()\n return results\n \ndef init():\n #add the config\n config_path = \"/root/DB-GPT/config/tool_config.yaml\"\n with open(config_path, 'r') as config_file:\n config = yaml.safe_load(config_file) \n db_args =DBArgs('pgsql', config)\n return db_args\n\n\n#create a table\ndef create_table(table_name,colsize, ncolumns):\n db=Database(init())\n column_definitions = ', '.join(f'name{i} varchar({colsize})' for i in range(ncolumns))\n creat_sql = f'CREATE TABLE {table_name} (id int, {column_definitions}, time timestamp);'\n db.execute_sqls(creat_sql)\n\n#delete the table\ndef delete_table(table_name):\n db=Database(init())\n delete_sql=f'DROP TABLE if exists {table_name}'\n db.execute_sqls(delete_sql)\n\n#print the current time\ndef print_time():\n current_time = datetime.datetime.now()\n formatted_time = current_time.strftime(\"%Y-%m-%d %H:%M:%S\")\n print(formatted_time)\n\n\ndef vacuum(threads,duration,ncolumns,nrows,colsize,table_name='table1'):\n 
db=Database(init())\n #create a new table\n print_time()\n delete_table(table_name)\n create_table(table_name,colsize, ncolumns)\n\n # insert some data to be deleted\n insert_definitions = ', '.join(f'(SELECT substr(md5(random()::text), 1, {colsize}))' for i in range(ncolumns))\n insert_data=f'insert into {table_name} select generate_series(1,{nrows}),{insert_definitions}, now();' \n db.execute_sqls(insert_data) \n\n # delete 80% of the rows\n delete_nrows=int(nrows*0.8)\n vacuum=f'delete from {table_name} where id < {delete_nrows};'\n db.execute_sqls(vacuum)\n\n # do the select , then the vacuum occurs\n select='select * from '+table_name+' where id='\n db.concurrent_execute_sql(threads,duration,select,nrows)\n\n #delete the table\n delete_table(table_name)\n print_time()\n\n\nif __name__ == \"__main__\":\n # Number of threads to use for concurrent inserts\n num_threads = 178\n \n # Duration for which to run the inserts (in seconds)\n insert_duration = None\n \n # Number of columns in the table\n num_columns = 10\n \n # Number of rows to insert\n num_rows = 3628154\n \n # Size of each column (in characters)\n column_size = 63\n \n # Table name\n table_name = 'table1'\n \n # Call the insert_large_data function\n vacuum(num_threads, insert_duration, num_columns, num_rows, column_size, table_name)\n", + "description": "In the database of an online store, 178 users are simultaneously performing searches after a large-scale data cleaning operation on a database table that contains 10 columns and 3,628,154 rows. Each column has a size of 63 characters. This process simulates the potential exception that can occur due to the search operation.\n", + "desc": "In the database of an e-commerce platform, hypothetically called 'ECommerceDB', there is a table named 'ProductInventory' that stores information about the inventory of various products. This table contains 3,628,154 rows of data, each representing an individual product entry. The table has 10 columns, with each column capable of storing up to 63 characters of information. These columns may include product ID, current stock level, last inventory update time, supplier ID, warehouse location, and other relevant details.In a specific scenario, 178 users simultaneously trigger a vacuum operation on the 'ProductInventory' table. The vacuum operation is a database maintenance task that reclaims unused space and optimizes table performance. It typically involves physically reorganizing the table's structure and rewriting its data to improve storage efficiency.However, due to the large number of concurrent users and the size of the 'ProductInventory' table, this vacuum operation could potentially lead to performance issues and anomalies. The database may experience resource contention, such as increased disk I/O usage and high CPU utilization, resulting in slower response times and potential disruptions in other database operations.To address this situation, it is advisable to carefully plan and coordinate the vacuum operation to minimize its impact on the overall database performance. 
This may involve scheduling the operation during periods of low user traffic, implementing incremental vacuuming techniques, or utilizing resources such as parallel processing to expedite the operation while minimizing disruptions to other database activities.\n" + }, + "116": { + "start_time": "1697313770", + "end_time": "1697313883", + "start_timestamp": "2023-10-15 04:02:50", + "end_timestamp": "2023-10-15 04:04:43", + "alerts": [], + "labels": [ + "too many indexes" + ], + "command": "python anomaly_trigger/main.py --anomaly REDUNDANT_INDEX", + "script": "import psycopg2\nimport sys\nsys.path.append('/root/DB-GPT/')\nimport time\nimport datetime\nimport random\nimport yaml\nfrom multiprocessing.pool import *\n\n\nclass DBArgs(object):\n\n def __init__(self, dbtype, config, dbname=None):\n self.dbtype = dbtype\n if self.dbtype == 'mysql':\n self.host = config['host']\n self.port = config['port']\n self.user = config['user']\n self.password = config['password']\n self.dbname = dbname if dbname else config['dbname']\n self.driver = 'com.mysql.jdbc.Driver'\n self.jdbc = 'jdbc:mysql://'\n else:\n self.host = config['host']\n self.port = config['port']\n self.user = config['user']\n self.password = config['password']\n self.dbname = dbname if dbname else config['dbname']\n self.driver = 'org.postgresql.Driver'\n self.jdbc = 'jdbc:postgresql://'\n\nclass Database():\n def __init__(self, args, timeout=-1):\n self.args = args\n self.conn = self.resetConn(timeout)\n\n\n # self.schema = self.compute_table_schema()\n\n def resetConn(self, timeout=-1):\n conn = psycopg2.connect(database=self.args.dbname,\n user=self.args.user,\n password=self.args.password,\n host=self.args.host,\n port=self.args.port)\n return conn\n \n def execute_sqls(self,sql):\n self.conn =self.resetConn(timeout=-1)\n cur = self.conn.cursor()\n cur.execute(sql)\n self.conn.commit()\n cur.close()\n self.conn.close()\n\n def execute_sql_duration(self, duration, sql, max_id=0, commit_interval=500):\n self.conn = self.resetConn(timeout=-1)\n cursor = self.conn.cursor()\n start = time.time()\n cnt = 0\n if duration > 0:\n while (time.time() - start) < duration:\n if max_id > 0:\n id = random.randint(1, max_id - 1)\n cursor.execute(sql + str(id) + ';')\n else:\n cursor.execute(sql)\n cnt += 1\n if cnt % commit_interval == 0:\n self.conn.commit()\n else:\n print(\"error, the duration should be larger than 0\")\n self.conn.commit()\n cursor.close()\n self.conn.close()\n return cnt\n\n def concurrent_execute_sql(self, threads, duration, sql, max_id=0, commit_interval=500):\n pool = ThreadPool(threads)\n results = [pool.apply_async(self.execute_sql_duration, (duration, sql, max_id, commit_interval)) for _ in range(threads)]\n pool.close()\n pool.join()\n return results\n \n def build_index(self, table_name, idx_num):\n self.conn = self.resetConn(timeout=-1)\n cursor = self.conn.cursor()\n \n for i in range(0, idx_num):\n the_sql = 'CREATE INDEX index_' + table_name + '_' + str(i) + ' ON ' + table_name + '(name' + str(i) + ');'\n print(the_sql)\n cursor.execute(the_sql)\n\n \n self.conn.commit()\n self.conn.close()\n return\n\n\n \n def drop_index(self,table_name):\n self.conn = self.resetConn(timeout=-1)\n cursor = self.conn.cursor()\n cursor.execute(\"select indexname from pg_indexes where tablename='\"+table_name+\"';\")\n idxs = cursor.fetchall()\n for idx in idxs:\n the_sql = 'DROP INDEX ' + idx[0] + ';'\n cursor.execute(the_sql)\n print(the_sql)\n self.conn.commit()\n self.conn.close()\n return\n\n\ndef init():\n #add the config\n 
config_path = \"/root/DB-GPT/config/tool_config.yaml\"\n with open(config_path, 'r') as config_file:\n config = yaml.safe_load(config_file) \n db_args =DBArgs('pgsql', config)\n return db_args\n\n\n#create a table\ndef create_table(table_name,colsize, ncolumns):\n db=Database(init())\n column_definitions = ', '.join(f'name{i} varchar({colsize})' for i in range(ncolumns))\n creat_sql = f'CREATE TABLE {table_name} (id int, {column_definitions}, time timestamp);'\n db.execute_sqls(creat_sql)\n\n#delete the table\ndef delete_table(table_name):\n db=Database(init())\n delete_sql=f'DROP TABLE if exists {table_name}'\n db.execute_sqls(delete_sql)\n\n#print the current time\ndef print_time():\n current_time = datetime.datetime.now()\n formatted_time = current_time.strftime(\"%Y-%m-%d %H:%M:%S\")\n print(formatted_time)\n\ndef redundent_index(threads,duration,ncolumns,nrows,colsize,nindex,table_name='table1'):\n #create a new table\n print_time()\n delete_table(table_name)\n create_table(table_name,colsize, ncolumns)\n db=Database(init())\n # insert some data to be updated \n insert_definitions = ', '.join(f'(SELECT substr(md5(random()::text), 1, {colsize}))' for i in range(ncolumns))\n insert_data=f'insert into {table_name} select generate_series(1,{nrows}),{insert_definitions}, now();' \n db.execute_sqls(insert_data) \n\n #initialization of the indexes\n nindex=int((nindex*ncolumns)/10)\n db.build_index(table_name,nindex)\n id_index='CREATE INDEX index_'+table_name+'_id ON '+table_name+'(id);'\n db.execute_sqls(id_index)\n\n #lock_contention\n pool = Pool(threads)\n for _ in range(threads):\n pool.apply_async(\n lock, (table_name, ncolumns, colsize, duration, nrows))\n pool.close()\n pool.join()\n\n #drop the index\n db.drop_index(table_name)\n\n #delete the table\n delete_table(table_name)\n print_time()\n\n\ndef lock(table_name, ncolumns, colsize, duration, nrows):\n args=init()\n start = time.time()\n #lock_contention\n while time.time()-start < duration:\n conn = psycopg2.connect(database=args.dbname, user=args.user, password=args.password,\n host=args.host, port=args.port)\n cur = conn.cursor()\n while time.time()-start < duration:\n col_name = random.randint(0, ncolumns-1)\n row_name = random.randint(1, nrows-1)\n lock_contention = f'update {table_name} set name{col_name}=(SELECT substr(md5(random()::text), 1, {colsize})) where id ={row_name}'\n #db.concurrent_execute_sql(threads,duration,lock_contention,nrows)\n cur.execute(lock_contention)\n conn.commit()\n conn.commit()\n conn.close()\n\nif __name__ == \"__main__\":\n # Number of threads to use for concurrent inserts\n num_threads = 8\n \n # Duration for which to run the inserts (in seconds)\n insert_duration = None\n \n # Number of columns in the table\n num_columns = 83\n \n # Number of rows to insert\n num_rows = 532309\n \n # Size of each column (in characters)\n column_size = 100\n \n # Table name\n table_name = 'table1'\n \n nindex=6\n \n # Call the insert_large_data function\n redundent_index(num_threads, insert_duration, num_columns, num_rows, column_size, nindex,table_name)\n", + "description": "In a database of an e-commerce platform with 83 columns and 532,309 rows, each with a column size of 100 characters, a large number of redundant indexes are created for attributes such as product name, category, and price range. 
These redundant indexes can cause additional storage overhead and impact the performance of query operations.\n", + "desc": "In a business intelligence scenario, particularly relating to financial analysis, there is a database called 'CorporateFinanceDB' that stores and processes financial data for large corporations. Within this database, there are multiple tables, one of which is a key table called 'FinancialRecords' that records various financial transactions and statement information. This table contains 532,309 rows of data, with each row representing a financial record. It has 83 columns, each containing information of up to 100 characters, such as transaction ID, transaction type, amount, date, department, project code, budget code, financial year, and audit status. In this scenario, the database administrator needs to optimize the database for complex financial queries. To enhance the efficiency of these queries, the administrator creates redundant indexes based on various criteria such as transaction type, date range, department, and project code. However, the creation of these redundant indexes can lead to additional storage usage and performance overhead in the database. It may also cause database fragmentation, which affects performance and can result in delayed report generation, impacting the decision-making process.\n" + }, + "117": { + "start_time": "1697313943", + "end_time": "1697314034", + "start_timestamp": "2023-10-15 04:05:43", + "end_timestamp": "2023-10-15 04:07:14", + "alerts": [], + "labels": [ + "INSERT_LARGE_DATA", + "IO_CONTENTION" + ], + "command": "python anomaly_trigger/main.py --anomaly INSERT_LARGE_DATA,IO_CONTENTION", + "script": "import os\nimport datetime\n\n#print the current time\ndef print_time():\n current_time = datetime.datetime.now()\n formatted_time = current_time.strftime(\"%Y-%m-%d %H:%M:%S\")\n print(formatted_time)\n\nif __name__ == \"__main__\":\n print_time()\n command = (\n \"su - root -c 'cd /sysbench-tpcc-master; \"\n \"./tpcc.lua --db-driver=pgsql --tables=2 --scale=3 --threads=50 --events=0 \"\n \"--pgsql-host=xxxx --pgsql-user=xxxx --pgsql-password=xxxx \"\n \"--pgsql-port=5432 --pgsql-db=tpcc --time=90 --rand-type=uniform --report-interval=10 run'\"\n )\n\n os.system(command)\n print_time()\n", + "description": "In a system where multiple users are sharing files, if there is a high demand for uploading, downloading, or editing files simultaneously, it can cause contention in the input/output (I/O) operations. This can result in slower file transfer speeds and decreased performance.\n", + "desc": "In a file sharing system, let's consider a database called 'TeamFileShareDB'. This database is used by teams or organizations to share files. It stores both the files themselves and their metadata, such as uploader information, file size, creation and modification dates, version history, access permissions, and download counts. On a regular workday, multiple users are likely to simultaneously upload, download, or edit files. For example, a project team might collaborate on an important report, with team members frequently uploading the latest versions of files while others download them for viewing or editing. The system may also handle large file storage and sharing, such as presentations, video conference recordings, or design drawings. Due to high concurrency in file operations, the 'TeamFileShareDB' database faces challenges of I/O (input/output) contention. 
When multiple users simultaneously upload or download large files, the system's storage and network bandwidth can become strained. This I/O contention may result in slower file transfer speeds, especially when there is limited bandwidth or insufficient server processing capabilities. Moreover, frequent write operations in the database, such as file uploads and metadata updates, can impact database performance. During peak periods, the database may encounter locking and transaction management issues, further slowing down file processing and metadata recording.\n" + }, + "118": { + "start_time": "1697314094", + "end_time": "1697314154", + "start_timestamp": "2023-10-15 04:08:14", + "end_timestamp": "2023-10-15 04:09:14", + "alerts": [], + "labels": [ + "POOR JOIN PERFORMANCE", + "CPU CONTENTION" + ], + "command": "python anomaly_trigger/main.py --anomaly POOR_JOIN_PERFORMANCE,CPU_CONTENTION", + "script": "import os\nimport re\nimport time\n\nimport psycopg2\n\n\nREPEATCOUNT = 1\nTIMELOGPATH = str(int(time.time())) + \"_job_trigger_time_log.txt\"\nTIMELOG = open(TIMELOGPATH, 'w+')\n\n\nclass Database():\n\n def __init__(self):\n self.conn = None\n self.conn = psycopg2.connect(database='imdbload',\n user='xxxx',\n password='xxxx',\n host='xxxx',\n port=5432)\n\n def execute_sql(self, sql):\n fail = 1\n cur = self.conn.cursor()\n i = 0\n cnt = 3\n while fail == 1 and i < cnt:\n try:\n fail = 0\n cur.execute(sql)\n except BaseException as error:\n fail = 1\n print(error)\n res = []\n if fail == 0:\n res = cur.fetchall()\n i = i + 1\n if fail == 1:\n # print(\"SQL Execution Fatal!!\", sql)\n return 0, ''\n elif fail == 0:\n return 1, res\n\n\ndef all_sql_files():\n res_path = \"{}/join-order-benchmark-master/\".format(\n os.path.dirname(os.path.abspath(__file__)))\n # all_file_list = list(filter(file_filter, os.listdir(res_path)))\n # all_file_list = sorted(all_file_list, key=custom_sort)\n all_file_list = [\n '1a.sql', '1b.sql', '1c.sql', '1d.sql',\n '2a.sql', '2b.sql', '2c.sql', '2d.sql',\n '3a.sql', '3b.sql', '3c.sql',\n '4a.sql', '4b.sql', '4c.sql',\n '5a.sql', '5b.sql', '5c.sql',\n '6a.sql', '6b.sql', '6c.sql', '6d.sql', '6e.sql', '6f.sql',\n '7a.sql', '7b.sql', '7c.sql',\n '8a.sql', '8b.sql', '8c.sql', '8d.sql',\n '9a.sql', '9b.sql', '9c.sql', '9d.sql',\n '10a.sql', '10b.sql', '10c.sql',\n '11a.sql', '11b.sql', '11c.sql', '11d.sql',\n '12a.sql', '12b.sql', '12c.sql',\n '13a.sql', '13b.sql', '13c.sql', '13d.sql',\n '14a.sql', '14b.sql', '14c.sql',\n '15a.sql', '15b.sql', '15c.sql', '15d.sql',\n '16a.sql', '16b.sql', '16c.sql', '16d.sql',\n '17a.sql', '17b.sql', '17c.sql', '17d.sql', '17e.sql', '17f.sql',\n '18a.sql', '18b.sql', '18c.sql',\n '19a.sql', '19b.sql', '19c.sql', '19d.sql',\n '20a.sql', '20b.sql', '20c.sql',\n '21a.sql', '21b.sql', '21c.sql',\n '22a.sql', '22b.sql', '22c.sql', '22d.sql',\n '23a.sql', '23b.sql', '23c.sql',\n '24a.sql', '24b.sql',\n '25a.sql', '25b.sql', '25c.sql',\n '26a.sql', '26b.sql', '26c.sql',\n '27a.sql', '27b.sql', '27c.sql',\n '28a.sql', '28b.sql', '28c.sql',\n '29a.sql', '29b.sql', '29c.sql',\n '30a.sql', '30b.sql', '30c.sql',\n '31a.sql', '31b.sql', '31c.sql',\n '32a.sql', '32b.sql',\n '33a.sql', '33b.sql', '33c.sql']\n\n print(all_file_list)\n files_list = []\n for file in all_file_list:\n files_list.append(res_path + file)\n return files_list\n\n\ndef custom_sort(item):\n # \u63d0\u53d6\u6570\u5b57\u548c\u5b57\u6bcd\u90e8\u5206\n match = re.match(r'(\\d+)(\\D+)', item)\n # 
\u5c06\u6570\u5b57\u90e8\u5206\u8f6c\u6362\u4e3a\u6574\u6570\u4ee5\u8fdb\u884c\u6bd4\u8f83\n num_part = int(match.group(1))\n # \u8fd4\u56de\u5143\u7ec4\u4ee5\u6309\u6570\u5b57\u548c\u5b57\u6bcd\u6392\u5e8f\n return (num_part, match.group(2))\n\n\ndef file_filter(f):\n if f[-4:] == '.sql' and 'schema' not in f and 'fkindexes' not in f:\n return True\n else:\n return False\n\n\ndef get_sql_from_file(file_name):\n file = open(file_name)\n lines = file.readlines().copy()\n sql = ''\n for line in lines:\n sql += line\n sql = sql.replace('\n', ' ').replace(' ', ' ').replace(' ', ' ')\n file.close()\n return sql\n\n\ndef test_hint_from_file(sql_file):\n db = Database()\n sql = get_sql_from_file(sql_file)\n success, result_cont = db.execute_sql(sql)\n print(success, result_cont)\n\n\ndef test_all():\n sql_files = all_sql_files()\n\n for sql_file in list(sql_files)[:-10]:\n if sql_file:\n test_hint_from_file(sql_file)\n\n\ndef test_one():\n res_path = \"{}/join-order-benchmark-master/\".format(\n os.path.dirname(os.path.abspath(__file__)))\n test_hint_from_file(res_path + '1a.sql')\n\n\nif __name__ == '__main__':\n for i in range(0, REPEATCOUNT):\n TIMELOG.write(str(int(time.time()))+\";\")\n test_all()\n TIMELOG.write(str(int(time.time()))+\"\n\")\n TIMELOG.flush()\n\n TIMELOG.close()\n\n)\n", + "description": "In a busy office environment, multiple employees simultaneously execute resource-intensive join operations on a large database table with poor join performance. This results in excessive CPU usage, causing contention among the employees and slowing down their tasks.\n", + "desc": "In the business intelligence scenario, let's consider a database called 'BIAnalyticsDB', which is used for performing data analysis and generating reports for a large organization. This database contains multiple tables, including a key table named 'SalesData', which stores sales information for different products and regions. The 'SalesData' table consists of various columns such as product ID, region ID, sales amount, sales date, customer information, and more. Suppose, during a peak analysis period, multiple users are executing complex join queries on the 'SalesData' table to generate sales reports based on different criteria, such as product category, region, or time frame. However, due to design and performance issues, the database might face poor join performance. Inefficient query plans, lack of appropriate indexes, or improper join conditions could contribute to this performance issue.As a result, executing these join queries takes a long time, leading to slow report generation and impacting the productivity of the analysis team. Moreover, during peak analysis periods, when many users are simultaneously executing join queries, the database might also face CPU contention issues. 
The increased CPU usage to process these queries can put a strain on the available processing power and cause delays or disruptions in other database operations.Overall, the poor join performance and CPU contention in the 'BIAnalyticsDB' database when executing complex join queries could hinder efficient data analysis and reporting, affecting the decision-making process within the organization.\n" + }, + "119": { + "start_time": "1697314214", + "end_time": "1697314365", + "start_timestamp": "2023-10-15 04:10:14", + "end_timestamp": "2023-10-15 04:12:45", + "alerts": [], + "labels": [ + "FETCH_LARGE_DATA", + "CORRELATED SUBQUERY" + ], + "command": "python anomaly_trigger/main.py --anomaly FETCH_LARGE_DATA,CORRELATED_SUBQUERY", + "script": "import os\nimport re\nimport time\n\nimport psycopg2\n\n\nREPEATCOUNT = 1\nTIMELOGPATH = str(int(time.time())) + \"_tpch_trigger_time_log.txt\"\nTIMELOG = open(TIMELOGPATH, 'w+')\n\n\nclass Database():\n\n def __init__(self):\n self.conn = None\n self.conn = psycopg2.connect(database='tpch',\n user='xxxx',\n password='xxxx',\n host='xxxx',\n port=5432)\n\n def execute_sql(self, sql):\n fail = 1\n cur = self.conn.cursor()\n i = 0\n cnt = 3\n while fail == 1 and i < cnt:\n try:\n fail = 0\n cur.execute(sql)\n except BaseException as error:\n fail = 1\n print(error)\n res = []\n if fail == 0:\n res = cur.fetchall()\n i = i + 1\n if fail == 1:\n # print(\"SQL Execution Fatal!!\", sql)\n return 0, ''\n elif fail == 0:\n return 1, res\n\n\ndef all_sql_files():\n res_path = \"{}/tpch-queries/\".format(\n os.path.dirname(os.path.abspath(__file__)))\n # all_file_list = list(filter(file_filter, os.listdir(res_path)))\n # all_file_list = sorted(all_file_list, key=custom_sort)\n all_file_list = [\n '4.explain.sql']\n\n print(all_file_list)\n files_list = []\n for file in all_file_list:\n files_list.append(res_path + file)\n return files_list\n\n\ndef custom_sort(item):\n # \u63d0\u53d6\u6570\u5b57\u548c\u5b57\u6bcd\u90e8\u5206\n match = re.match(r'(\\d+)(\\D+)', item)\n # \u5c06\u6570\u5b57\u90e8\u5206\u8f6c\u6362\u4e3a\u6574\u6570\u4ee5\u8fdb\u884c\u6bd4\u8f83\n num_part = int(match.group(1))\n # \u8fd4\u56de\u5143\u7ec4\u4ee5\u6309\u6570\u5b57\u548c\u5b57\u6bcd\u6392\u5e8f\n return (num_part, match.group(2))\n\n\ndef file_filter(f):\n if f[-4:] == '.sql' and 'schema' not in f and 'fkindexes' not in f:\n return True\n else:\n return False\n\n\ndef get_sql_from_file(file_name):\n file = open(file_name)\n lines = file.readlines().copy()\n sql = ''\n for line in lines:\n sql += line\n sql = sql.replace('\n', ' ').replace(' ', ' ').replace(' ', ' ')\n file.close()\n return sql\n\n\ndef test_hint_from_file(sql_file):\n db = Database()\n sql = get_sql_from_file(sql_file)\n success, result_cont = db.execute_sql(sql)\n print(success, result_cont)\n\n\ndef test_all():\n sql_files = all_sql_files()\n\n for sql_file in list(sql_files):\n if sql_file:\n test_hint_from_file(sql_file)\n\n\ndef test_one():\n res_path = \"{}/tpch-queries/\".format(\n os.path.dirname(os.path.abspath(__file__)))\n test_hint_from_file(res_path + '1.explain.sql')\n\n\nif __name__ == '__main__':\n for i in range(0, REPEATCOUNT):\n TIMELOG.write(str(int(time.time()))+\";\")\n test_all()\n TIMELOG.write(str(int(time.time()))+\"\n\")\n TIMELOG.flush()\n\n TIMELOG.close()\n", + "description": "In an online store's database, there is a scenario where the system needs to fetch a large amount of data and perform correlated subqueries. 
For example, when trying to calculate the inventory of each product, the system might need to run subqueries to retrieve related data. If not optimized properly, this process can lead to a decrease in query performance.\n", + "desc": "In the scenario of an e-commerce platform, there is a database named 'ECommerceDB' that stores various information about products. One important table in this database is called 'ProductInventory', which contains inventory data for tens of thousands or even hundreds of thousands of products. This inventory data includes details such as product ID, current stock level, last inventory update time, supplier ID, and warehouse location. When performing certain queries, such as determining the total inventory of products within a specific category, related subqueries need to be executed. These subqueries involve selecting products of a particular category from the 'ProductDetails' table and retrieving their inventory information from the 'ProductInventory' table. However, when there is a large number of products involved in the query, the performance of these related subqueries can become inefficient. This inefficiency occurs because executing individual subqueries for each product to retrieve inventory information can be time-consuming. As a result, the database may need to read a significant amount of data from the disk, leading to potential I/O bottlenecks.\n" + }, + "120": { + "start_time": "1697314425", + "end_time": "1697314496", + "start_timestamp": "2023-10-15 04:13:45", + "end_timestamp": "2023-10-15 04:14:56", + "alerts": [], + "labels": [ + "highly concurrent commits or highly concurrent inserts" + ], + "command": "python anomaly_trigger/main.py --anomaly INSERT_LARGE_DATA", + "script": "import psycopg2\nimport sys\nsys.path.append('/root/DB-GPT/')\nimport time\nimport datetime\nimport random\nimport yaml\nfrom multiprocessing.pool import *\n\nclass DBArgs(object):\n\n def __init__(self, dbtype, config, dbname=None):\n self.dbtype = dbtype\n if self.dbtype == 'mysql':\n self.host = config['host']\n self.port = config['port']\n self.user = config['user']\n self.password = config['password']\n self.dbname = dbname if dbname else config['dbname']\n self.driver = 'com.mysql.jdbc.Driver'\n self.jdbc = 'jdbc:mysql://'\n else:\n self.host = config['host']\n self.port = config['port']\n self.user = config['user']\n self.password = config['password']\n self.dbname = dbname if dbname else config['dbname']\n self.driver = 'org.postgresql.Driver'\n self.jdbc = 'jdbc:postgresql://'\n\nclass Database():\n def __init__(self, args, timeout=-1):\n self.args = args\n self.conn = self.resetConn(timeout)\n\n\n # self.schema = self.compute_table_schema()\n\n def resetConn(self, timeout=-1):\n conn = psycopg2.connect(database=self.args.dbname,\n user=self.args.user,\n password=self.args.password,\n host=self.args.host,\n port=self.args.port)\n return conn\n\n def execute_sqls(self,sql):\n self.conn =self.resetConn(timeout=-1)\n cur = self.conn.cursor()\n cur.execute(sql)\n self.conn.commit()\n cur.close()\n self.conn.close()\n\n def execute_sql_duration(self, duration, sql, max_id=0, commit_interval=500):\n self.conn = self.resetConn(timeout=-1)\n cursor = self.conn.cursor()\n start = time.time()\n cnt = 0\n if duration > 0:\n while (time.time() - start) < duration:\n if max_id > 0:\n id = random.randint(1, max_id - 1)\n cursor.execute(sql + str(id) + ';')\n else:\n cursor.execute(sql)\n cnt += 1\n if cnt % commit_interval == 0:\n self.conn.commit()\n else:\n print(\"error, 
the duration should be larger than 0\")\n self.conn.commit()\n cursor.close()\n self.conn.close()\n return cnt\n\n def concurrent_execute_sql(self, threads, duration, sql, max_id=0, commit_interval=500):\n pool = ThreadPool(threads)\n results = [pool.apply_async(self.execute_sql_duration, (duration, sql, max_id, commit_interval)) for _ in range(threads)]\n pool.close()\n pool.join()\n return results\n \ndef init():\n #add the config\n config_path = \"/root/DB-GPT/config/tool_config.yaml\"\n with open(config_path, 'r') as config_file:\n config = yaml.safe_load(config_file) \n db_args =DBArgs('pgsql', config)\n return db_args\n\n#create a table\ndef create_table(table_name,colsize, ncolumns):\n db=Database(init())\n column_definitions = ', '.join(f'name{i} varchar({colsize})' for i in range(ncolumns))\n creat_sql = f'CREATE TABLE {table_name} (id int, {column_definitions}, time timestamp);'\n db.execute_sqls(creat_sql)\n\n#delete the table\ndef delete_table(table_name):\n db=Database(init())\n delete_sql=f'DROP TABLE if exists {table_name}'\n db.execute_sqls(delete_sql)\n\n#print the current time\ndef print_time():\n current_time = datetime.datetime.now()\n formatted_time = current_time.strftime(\"%Y-%m-%d %H:%M:%S\")\n print(formatted_time)\n\n\ndef insert_large_data(threads,duration,ncolumns,nrows,colsize,table_name='table1'):\n\n print_time()\n #Delete undeleted tables\n delete_table(table_name)\n #create a new table\n create_table(table_name,colsize, ncolumns)\n db=Database(init())\n #insert the data\n #insert_definitions = ', '.join(f'repeat(round(random()*999)::text,{(colsize//3)})' for i in range(ncolumns))\n insert_definitions = ', '.join(f'(SELECT substr(md5(random()::text), 1, {colsize}))' for i in range(ncolumns))\n insert_data=f'insert into {table_name} select generate_series(1,{nrows}),{insert_definitions}, now();'\n db.concurrent_execute_sql(threads,duration,insert_data,commit_interval=1)\n\n #delete the table\n delete_table(table_name)\n \n #print the end time\n print_time()\nif __name__ == \"__main__\":\n # Number of threads to use for concurrent inserts\n num_threads = 53\n \n # Duration for which to run the inserts (in seconds)\n insert_duration = None\n \n # Number of columns in the table\n num_columns = 15\n \n # Number of rows to insert\n num_rows = 50\n \n # Size of each column (in characters)\n column_size = 31\n \n # Table name\n table_name = 'table1'\n \n # Call the insert_large_data function\n insert_large_data(num_threads, insert_duration, num_columns, num_rows, column_size, table_name)\n", + "description": "In an IoT system, 53 sensors are generating a large amount of data that needs to be inserted into a database. Each data entry consists of 15 columns, with each column having a size of 31 characters. In total, there are 50 data entries. This process may cause an exception in the database due to the simultaneous insertion of a large amount of data.\n", + "desc": "In the Internet of Things (IoT) domain, consider a database named 'DeviceDataDB', which is specifically designed to handle data from various IoT devices. This database contains a key table called 'DeviceReadings' that stores the readings and measurements from these devices. Each row in this table represents a specific measurement at a certain time, and there are a total of 50 rows. 
The table consists of 15 columns, each containing data of up to 31 characters, including device ID, sensor type, reading value, timestamp, device location, status, and other relevant information.In this scenario, the database is facing an issue related to the insertion of a large amount of data. The administrator is trying to simulate the scenario by executing the provided script, which triggers the anomaly of inserting large data. The script specifies that 53 threads will be used to insert the data, which means that 53 concurrent insertion operations will be carried out simultaneously.Without proper optimization measures, such as batching the insert operations or designing efficient data insertion algorithms, this large-scale data insertion can lead to performance issues in the database. This might include increased write latency, database locking, or even database server crashes, affecting the overall functionality and reliability of the IoT system.\n" + }, + "121": { + "start_time": "1697314556", + "end_time": "1697314627", + "start_timestamp": "2023-10-15 04:15:56", + "end_timestamp": "2023-10-15 04:17:07", + "alerts": [], + "labels": [ + "highly concurrent commits or highly concurrent inserts" + ], + "command": "python anomaly_trigger/main.py --anomaly INSERT_LARGE_DATA", + "script": "import psycopg2\nimport sys\nsys.path.append('/root/DB-GPT/')\nimport time\nimport datetime\nimport random\nimport yaml\nfrom multiprocessing.pool import *\n\nclass DBArgs(object):\n\n def __init__(self, dbtype, config, dbname=None):\n self.dbtype = dbtype\n if self.dbtype == 'mysql':\n self.host = config['host']\n self.port = config['port']\n self.user = config['user']\n self.password = config['password']\n self.dbname = dbname if dbname else config['dbname']\n self.driver = 'com.mysql.jdbc.Driver'\n self.jdbc = 'jdbc:mysql://'\n else:\n self.host = config['host']\n self.port = config['port']\n self.user = config['user']\n self.password = config['password']\n self.dbname = dbname if dbname else config['dbname']\n self.driver = 'org.postgresql.Driver'\n self.jdbc = 'jdbc:postgresql://'\n\nclass Database():\n def __init__(self, args, timeout=-1):\n self.args = args\n self.conn = self.resetConn(timeout)\n\n\n # self.schema = self.compute_table_schema()\n\n def resetConn(self, timeout=-1):\n conn = psycopg2.connect(database=self.args.dbname,\n user=self.args.user,\n password=self.args.password,\n host=self.args.host,\n port=self.args.port)\n return conn\n\n def execute_sqls(self,sql):\n self.conn =self.resetConn(timeout=-1)\n cur = self.conn.cursor()\n cur.execute(sql)\n self.conn.commit()\n cur.close()\n self.conn.close()\n\n def execute_sql_duration(self, duration, sql, max_id=0, commit_interval=500):\n self.conn = self.resetConn(timeout=-1)\n cursor = self.conn.cursor()\n start = time.time()\n cnt = 0\n if duration > 0:\n while (time.time() - start) < duration:\n if max_id > 0:\n id = random.randint(1, max_id - 1)\n cursor.execute(sql + str(id) + ';')\n else:\n cursor.execute(sql)\n cnt += 1\n if cnt % commit_interval == 0:\n self.conn.commit()\n else:\n print(\"error, the duration should be larger than 0\")\n self.conn.commit()\n cursor.close()\n self.conn.close()\n return cnt\n\n def concurrent_execute_sql(self, threads, duration, sql, max_id=0, commit_interval=500):\n pool = ThreadPool(threads)\n results = [pool.apply_async(self.execute_sql_duration, (duration, sql, max_id, commit_interval)) for _ in range(threads)]\n pool.close()\n pool.join()\n return results\n \ndef init():\n #add the config\n 
config_path = \"/root/DB-GPT/config/tool_config.yaml\"\n with open(config_path, 'r') as config_file:\n config = yaml.safe_load(config_file) \n db_args =DBArgs('pgsql', config)\n return db_args\n\n#create a table\ndef create_table(table_name,colsize, ncolumns):\n db=Database(init())\n column_definitions = ', '.join(f'name{i} varchar({colsize})' for i in range(ncolumns))\n creat_sql = f'CREATE TABLE {table_name} (id int, {column_definitions}, time timestamp);'\n db.execute_sqls(creat_sql)\n\n#delete the table\ndef delete_table(table_name):\n db=Database(init())\n delete_sql=f'DROP TABLE if exists {table_name}'\n db.execute_sqls(delete_sql)\n\n#print the current time\ndef print_time():\n current_time = datetime.datetime.now()\n formatted_time = current_time.strftime(\"%Y-%m-%d %H:%M:%S\")\n print(formatted_time)\n\n\ndef insert_large_data(threads,duration,ncolumns,nrows,colsize,table_name='table1'):\n\n print_time()\n #Delete undeleted tables\n delete_table(table_name)\n #create a new table\n create_table(table_name,colsize, ncolumns)\n db=Database(init())\n #insert the data\n #insert_definitions = ', '.join(f'repeat(round(random()*999)::text,{(colsize//3)})' for i in range(ncolumns))\n insert_definitions = ', '.join(f'(SELECT substr(md5(random()::text), 1, {colsize}))' for i in range(ncolumns))\n insert_data=f'insert into {table_name} select generate_series(1,{nrows}),{insert_definitions}, now();'\n db.concurrent_execute_sql(threads,duration,insert_data,commit_interval=1)\n\n #delete the table\n delete_table(table_name)\n \n #print the end time\n print_time()\nif __name__ == \"__main__\":\n # Number of threads to use for concurrent inserts\n num_threads = 53\n \n # Duration for which to run the inserts (in seconds)\n insert_duration = None\n \n # Number of columns in the table\n num_columns = 30\n \n # Number of rows to insert\n num_rows = 55\n \n # Size of each column (in characters)\n column_size = 99\n \n # Table name\n table_name = 'table1'\n \n # Call the insert_large_data function\n insert_large_data(num_threads, insert_duration, num_columns, num_rows, column_size, table_name)\n", + "description": "In a real-life scenario, there is a data-intensive application where 53 users simultaneously insert a large amount of data into a database table. The table contains 30 columns, each with a size of 99 characters, and there are 55 rows of data. This simulation aims to test the performance and stability of the database under such conditions.\n", + "desc": "In an Internet of Things (IoT) scenario, there is a database specifically designed for collecting and analyzing sensor data. This database is called 'SensorDataDB' and it is used to handle a large volume of data from various types of sensors. One of the primary tables in this database is called 'SensorReadings', which stores data from the sensors. Each row in this table represents a single reading from a sensor and there are a total of 55 rows of data. The table consists of 30 columns, each containing information of up to 99 characters. These columns may include sensor ID, reading type, reading value, timestamp, sensor location, and status information. In this particular scenario, there is a need to insert a large amount of data into the 'SensorReadings' table. This insertion operation is being performed with 53 concurrent threads, which means that 53 separate processes are simultaneously inserting data into the table. This can result in a high concurrency situation and may lead to performance issues in the database. 
Without proper optimization and indexing, the database may struggle to handle the simultaneous insertion of such a large amount of data. This can cause delays in the insertion process and may even lead to anomalies in the database. For example, the database may experience slower performance, increased locking, or even failures in the insertion of the large data.\n" + }, + "122": { + "start_time": "1697314687", + "end_time": "1697314747", + "start_timestamp": "2023-10-15 04:18:07", + "end_timestamp": "2023-10-15 04:19:07", + "alerts": [], + "labels": [ + "highly concurrent updates" + ], + "command": "python anomaly_trigger/main.py --anomaly LOCK_CONTENTION", + "script": "import psycopg2\nimport sys\nsys.path.append('/root/DB-GPT/')\nimport time\nimport datetime\nimport random\nimport yaml\nfrom multiprocessing.pool import *\n\n\nclass DBArgs(object):\n\n def __init__(self, dbtype, config, dbname=None):\n self.dbtype = dbtype\n if self.dbtype == 'mysql':\n self.host = config['host']\n self.port = config['port']\n self.user = config['user']\n self.password = config['password']\n self.dbname = dbname if dbname else config['dbname']\n self.driver = 'com.mysql.jdbc.Driver'\n self.jdbc = 'jdbc:mysql://'\n else:\n self.host = config['host']\n self.port = config['port']\n self.user = config['user']\n self.password = config['password']\n self.dbname = dbname if dbname else config['dbname']\n self.driver = 'org.postgresql.Driver'\n self.jdbc = 'jdbc:postgresql://'\n\nclass Database():\n def __init__(self, args, timeout=-1):\n self.args = args\n self.conn = self.resetConn(timeout)\n\n\n # self.schema = self.compute_table_schema()\n\n def resetConn(self, timeout=-1):\n conn = psycopg2.connect(database=self.args.dbname,\n user=self.args.user,\n password=self.args.password,\n host=self.args.host,\n port=self.args.port)\n return conn\n \n def execute_sqls(self,sql):\n self.conn =self.resetConn(timeout=-1)\n cur = self.conn.cursor()\n cur.execute(sql)\n self.conn.commit()\n cur.close()\n self.conn.close()\n\n def execute_sql_duration(self, duration, sql, max_id=0, commit_interval=500):\n self.conn = self.resetConn(timeout=-1)\n cursor = self.conn.cursor()\n start = time.time()\n cnt = 0\n if duration > 0:\n while (time.time() - start) < duration:\n if max_id > 0:\n id = random.randint(1, max_id - 1)\n cursor.execute(sql + str(id) + ';')\n else:\n cursor.execute(sql)\n cnt += 1\n if cnt % commit_interval == 0:\n self.conn.commit()\n else:\n print(\"error, the duration should be larger than 0\")\n self.conn.commit()\n cursor.close()\n self.conn.close()\n return cnt\n\n def concurrent_execute_sql(self, threads, duration, sql, max_id=0, commit_interval=500):\n pool = ThreadPool(threads)\n results = [pool.apply_async(self.execute_sql_duration, (duration, sql, max_id, commit_interval)) for _ in range(threads)]\n pool.close()\n pool.join()\n return results\n\ndef init():\n #add the config\n config_path = \"/root/DB-GPT/config/tool_config.yaml\"\n with open(config_path, 'r') as config_file:\n config = yaml.safe_load(config_file) \n db_args =DBArgs('pgsql', config)\n return db_args\n\n\n#create a table\ndef create_table(table_name,colsize, ncolumns):\n db=Database(init())\n column_definitions = ', '.join(f'name{i} varchar({colsize})' for i in range(ncolumns))\n creat_sql = f'CREATE TABLE {table_name} (id int, {column_definitions}, time timestamp);'\n db.execute_sqls(creat_sql)\n\n#delete the table\ndef delete_table(table_name):\n db=Database(init())\n delete_sql=f'DROP TABLE if exists {table_name}'\n 
db.execute_sqls(delete_sql)\n\n#print the current time\ndef print_time():\n current_time = datetime.datetime.now()\n formatted_time = current_time.strftime(\"%Y-%m-%d %H:%M:%S\")\n print(formatted_time)\n\n\ndef lock_contention(threads,duration,ncolumns,nrows,colsize,table_name='table1'):\n #create a new table\n print_time()\n delete_table(table_name)\n create_table(table_name,colsize, ncolumns)\n db=Database(init())\n # insert some data to be updated \n insert_definitions = ', '.join(f'(SELECT substr(md5(random()::text), 1, {colsize}))' for i in range(ncolumns))\n insert_data=f'insert into {table_name} select generate_series(1,{nrows}),{insert_definitions}, now();' \n db.execute_sqls(insert_data) \n pool = Pool(threads)\n for _ in range(threads):\n pool.apply_async(\n lock, (table_name, ncolumns, colsize, duration, nrows))\n pool.close()\n pool.join()\n #delete the table\n delete_table(table_name)\n print_time()\n\ndef lock(table_name, ncolumns, colsize, duration, nrows):\n args=init()\n start = time.time()\n #lock_contention\n while time.time()-start < duration:\n conn = psycopg2.connect(database=args.dbname, user=args.user, password=args.password,\n host=args.host, port=args.port)\n cur = conn.cursor()\n while time.time()-start < duration:\n col_name = random.randint(0, ncolumns-1)\n row_name = random.randint(1, nrows-1)\n lock_contention = f'update {table_name} set name{col_name}=(SELECT substr(md5(random()::text), 1, {colsize})) where id ={row_name}'\n #db.concurrent_execute_sql(threads,duration,lock_contention,nrows)\n cur.execute(lock_contention)\n conn.commit()\n conn.commit()\n conn.close()\n\nif __name__ == \"__main__\":\n # Number of threads to use for concurrent inserts\n num_threads = 96\n \n # Duration for which to run the inserts (in seconds)\n insert_duration = None\n \n # Number of columns in the table\n num_columns = 80\n \n # Number of rows to insert\n num_rows = 314\n \n # Size of each column (in characters)\n column_size = 55\n \n # Table name\n table_name = 'table1'\n \n # Call the insert_large_data function\n lock_contention(num_threads, insert_duration, num_columns, num_rows, column_size, table_name)\n", + "description": "In a database for an online store, 96 users simultaneously attempt to perform frequent update operations on a database table with 80 columns and 314 rows of product records, each with a column size of 55 characters. These users compete with each other to lock the database table and perform the update operations, simulating a database exception caused by the contention for locks.\n", + "desc": "In an Internet of Things (IoT) scenario, imagine a database called 'IoTDataDB' that is used for storing and processing data from various sensors. This database contains a key table named 'SensorReadings' that records detailed information about sensor readings. Each row in this table represents a reading from a sensor and includes information such as sensor ID, reading type, reading value, timestamp, location, and status. In this scenario, there are 314 rows of data in the table, each containing 80 columns of information, with each column able to store up to 55 characters. Now, suppose there are 96 sensors that are simultaneously transmitting data at a high frequency. This high concurrency of data transmission can cause contention issues in the database. Due to the design of the table and the database's locking mechanism, multiple sensors trying to write data to the same or adjacent rows can lead to competition for locking the database table. 
This contention for locking resources can result in performance issues, such as delayed processing or failure of other sensors' data transmission requests. In an IoT environment, such locking contention can impact the real-time processing of sensor data and the overall efficiency of the system.\n" + }, + "123": { + "start_time": "1697314807", + "end_time": "1697314859", + "start_timestamp": "2023-10-15 04:20:07", + "end_timestamp": "2023-10-15 04:20:59", + "alerts": [], + "labels": [ + "highly deletes" + ], + "command": "python anomaly_trigger/main.py --anomaly VACUUM", + "script": "import psycopg2\nimport sys\nsys.path.append('/root/DB-GPT/')\nimport time\nimport datetime\nimport random\nimport yaml\nfrom multiprocessing.pool import *\n\n\nclass DBArgs(object):\n\n def __init__(self, dbtype, config, dbname=None):\n self.dbtype = dbtype\n if self.dbtype == 'mysql':\n self.host = config['host']\n self.port = config['port']\n self.user = config['user']\n self.password = config['password']\n self.dbname = dbname if dbname else config['dbname']\n self.driver = 'com.mysql.jdbc.Driver'\n self.jdbc = 'jdbc:mysql://'\n else:\n self.host = config['host']\n self.port = config['port']\n self.user = config['user']\n self.password = config['password']\n self.dbname = dbname if dbname else config['dbname']\n self.driver = 'org.postgresql.Driver'\n self.jdbc = 'jdbc:postgresql://'\n\nclass Database():\n def __init__(self, args, timeout=-1):\n self.args = args\n self.conn = self.resetConn(timeout)\n\n\n # self.schema = self.compute_table_schema()\n\n def resetConn(self, timeout=-1):\n conn = psycopg2.connect(database=self.args.dbname,\n user=self.args.user,\n password=self.args.password,\n host=self.args.host,\n port=self.args.port)\n return conn\n \n def execute_sqls(self,sql):\n self.conn =self.resetConn(timeout=-1)\n cur = self.conn.cursor()\n cur.execute(sql)\n self.conn.commit()\n cur.close()\n self.conn.close()\n\n def execute_sql_duration(self, duration, sql, max_id=0, commit_interval=500):\n self.conn = self.resetConn(timeout=-1)\n cursor = self.conn.cursor()\n start = time.time()\n cnt = 0\n if duration > 0:\n while (time.time() - start) < duration:\n if max_id > 0:\n id = random.randint(1, max_id - 1)\n cursor.execute(sql + str(id) + ';')\n else:\n cursor.execute(sql)\n cnt += 1\n if cnt % commit_interval == 0:\n self.conn.commit()\n else:\n print(\"error, the duration should be larger than 0\")\n self.conn.commit()\n cursor.close()\n self.conn.close()\n return cnt\n\n def concurrent_execute_sql(self, threads, duration, sql, max_id=0, commit_interval=500):\n pool = ThreadPool(threads)\n results = [pool.apply_async(self.execute_sql_duration, (duration, sql, max_id, commit_interval)) for _ in range(threads)]\n pool.close()\n pool.join()\n return results\n \ndef init():\n #add the config\n config_path = \"/root/DB-GPT/config/tool_config.yaml\"\n with open(config_path, 'r') as config_file:\n config = yaml.safe_load(config_file) \n db_args =DBArgs('pgsql', config)\n return db_args\n\n\n#create a table\ndef create_table(table_name,colsize, ncolumns):\n db=Database(init())\n column_definitions = ', '.join(f'name{i} varchar({colsize})' for i in range(ncolumns))\n creat_sql = f'CREATE TABLE {table_name} (id int, {column_definitions}, time timestamp);'\n db.execute_sqls(creat_sql)\n\n#delete the table\ndef delete_table(table_name):\n db=Database(init())\n delete_sql=f'DROP TABLE if exists {table_name}'\n db.execute_sqls(delete_sql)\n\n#print the current time\ndef print_time():\n current_time = 
datetime.datetime.now()\n formatted_time = current_time.strftime(\"%Y-%m-%d %H:%M:%S\")\n print(formatted_time)\n\n\ndef vacuum(threads,duration,ncolumns,nrows,colsize,table_name='table1'):\n db=Database(init())\n #create a new table\n print_time()\n delete_table(table_name)\n create_table(table_name,colsize, ncolumns)\n\n # insert some data to be deleted\n insert_definitions = ', '.join(f'(SELECT substr(md5(random()::text), 1, {colsize}))' for i in range(ncolumns))\n insert_data=f'insert into {table_name} select generate_series(1,{nrows}),{insert_definitions}, now();' \n db.execute_sqls(insert_data) \n\n # delete 80% of the rows\n delete_nrows=int(nrows*0.8)\n vacuum=f'delete from {table_name} where id < {delete_nrows};'\n db.execute_sqls(vacuum)\n\n # do the select , then the vacuum occurs\n select='select * from '+table_name+' where id='\n db.concurrent_execute_sql(threads,duration,select,nrows)\n\n #delete the table\n delete_table(table_name)\n print_time()\n\n\nif __name__ == \"__main__\":\n # Number of threads to use for concurrent inserts\n num_threads = 162\n \n # Duration for which to run the inserts (in seconds)\n insert_duration = None\n \n # Number of columns in the table\n num_columns = 15\n \n # Number of rows to insert\n num_rows = 3400337\n \n # Size of each column (in characters)\n column_size = 99\n \n # Table name\n table_name = 'table1'\n \n # Call the insert_large_data function\n vacuum(num_threads, insert_duration, num_columns, num_rows, column_size, table_name)\n", + "description": "In a database system that supports an online store, there are 162 users simultaneously searching for products using various criteria such as product name, category, and price range. The search is performed on a table with 15 columns and 3,400,337 rows, where each column can hold up to 99 characters. In this scenario, the search may encounter an exception due to the lack of optimization or resource limitations in the database system.\n", + "desc": "In a business intelligence scenario, there is a database named 'SalesAnalyticsDB' that stores sales data for a large company. The database contains a key table called 'SalesRecords' which records detailed information about each sale, including information such as transaction ID, product ID, customer ID, sales date, sales revenue, and other relevant data. This table consists of 3,400,337 rows, with each row representing a specific sales transaction. There are a total of 15 columns in the table, and each column can contain up to 99 characters. In order to optimize the performance of the database and ensure efficient query processing, regular maintenance operations need to be performed. One of these operations is vacuuming, which helps in reclaiming storage space and optimizing the organization of data within the table. Vacuuming involves removing any unused or outdated data, reorganizing the table structure, and freeing up space that was previously occupied by deleted or updated records. In this particular scenario, the script is specifying to perform a vacuum operation on the 'SalesRecords' table with a high level of concurrency. It is using 162 threads to speed up the vacuuming process. 
By performing this vacuum operation, the database administrator aims to improve the overall performance of the database by reducing storage fragmentation and optimizing the allocation of storage space.\n" + }, + "124": { + "start_time": "1697314919", + "end_time": "1697315033", + "start_timestamp": "2023-10-15 04:21:59", + "end_timestamp": "2023-10-15 04:23:53", + "alerts": [], + "labels": [ + "too many indexes" + ], + "command": "python anomaly_trigger/main.py --anomaly REDUNDANT_INDEX", + "script": "import psycopg2\nimport sys\nsys.path.append('/root/DB-GPT/')\nimport time\nimport datetime\nimport random\nimport yaml\nfrom multiprocessing.pool import *\n\n\nclass DBArgs(object):\n\n def __init__(self, dbtype, config, dbname=None):\n self.dbtype = dbtype\n if self.dbtype == 'mysql':\n self.host = config['host']\n self.port = config['port']\n self.user = config['user']\n self.password = config['password']\n self.dbname = dbname if dbname else config['dbname']\n self.driver = 'com.mysql.jdbc.Driver'\n self.jdbc = 'jdbc:mysql://'\n else:\n self.host = config['host']\n self.port = config['port']\n self.user = config['user']\n self.password = config['password']\n self.dbname = dbname if dbname else config['dbname']\n self.driver = 'org.postgresql.Driver'\n self.jdbc = 'jdbc:postgresql://'\n\nclass Database():\n def __init__(self, args, timeout=-1):\n self.args = args\n self.conn = self.resetConn(timeout)\n\n\n # self.schema = self.compute_table_schema()\n\n def resetConn(self, timeout=-1):\n conn = psycopg2.connect(database=self.args.dbname,\n user=self.args.user,\n password=self.args.password,\n host=self.args.host,\n port=self.args.port)\n return conn\n \n def execute_sqls(self,sql):\n self.conn =self.resetConn(timeout=-1)\n cur = self.conn.cursor()\n cur.execute(sql)\n self.conn.commit()\n cur.close()\n self.conn.close()\n\n def execute_sql_duration(self, duration, sql, max_id=0, commit_interval=500):\n self.conn = self.resetConn(timeout=-1)\n cursor = self.conn.cursor()\n start = time.time()\n cnt = 0\n if duration > 0:\n while (time.time() - start) < duration:\n if max_id > 0:\n id = random.randint(1, max_id - 1)\n cursor.execute(sql + str(id) + ';')\n else:\n cursor.execute(sql)\n cnt += 1\n if cnt % commit_interval == 0:\n self.conn.commit()\n else:\n print(\"error, the duration should be larger than 0\")\n self.conn.commit()\n cursor.close()\n self.conn.close()\n return cnt\n\n def concurrent_execute_sql(self, threads, duration, sql, max_id=0, commit_interval=500):\n pool = ThreadPool(threads)\n results = [pool.apply_async(self.execute_sql_duration, (duration, sql, max_id, commit_interval)) for _ in range(threads)]\n pool.close()\n pool.join()\n return results\n \n def build_index(self, table_name, idx_num):\n self.conn = self.resetConn(timeout=-1)\n cursor = self.conn.cursor()\n \n for i in range(0, idx_num):\n the_sql = 'CREATE INDEX index_' + table_name + '_' + str(i) + ' ON ' + table_name + '(name' + str(i) + ');'\n print(the_sql)\n cursor.execute(the_sql)\n\n \n self.conn.commit()\n self.conn.close()\n return\n\n\n \n def drop_index(self,table_name):\n self.conn = self.resetConn(timeout=-1)\n cursor = self.conn.cursor()\n cursor.execute(\"select indexname from pg_indexes where tablename='\"+table_name+\"';\")\n idxs = cursor.fetchall()\n for idx in idxs:\n the_sql = 'DROP INDEX ' + idx[0] + ';'\n cursor.execute(the_sql)\n print(the_sql)\n self.conn.commit()\n self.conn.close()\n return\n\n\ndef init():\n #add the config\n config_path = \"/root/DB-GPT/config/tool_config.yaml\"\n 
with open(config_path, 'r') as config_file:\n config = yaml.safe_load(config_file) \n db_args =DBArgs('pgsql', config)\n return db_args\n\n\n#create a table\ndef create_table(table_name,colsize, ncolumns):\n db=Database(init())\n column_definitions = ', '.join(f'name{i} varchar({colsize})' for i in range(ncolumns))\n creat_sql = f'CREATE TABLE {table_name} (id int, {column_definitions}, time timestamp);'\n db.execute_sqls(creat_sql)\n\n#delete the table\ndef delete_table(table_name):\n db=Database(init())\n delete_sql=f'DROP TABLE if exists {table_name}'\n db.execute_sqls(delete_sql)\n\n#print the current time\ndef print_time():\n current_time = datetime.datetime.now()\n formatted_time = current_time.strftime(\"%Y-%m-%d %H:%M:%S\")\n print(formatted_time)\n\ndef redundent_index(threads,duration,ncolumns,nrows,colsize,nindex,table_name='table1'):\n #create a new table\n print_time()\n delete_table(table_name)\n create_table(table_name,colsize, ncolumns)\n db=Database(init())\n # insert some data to be updated \n insert_definitions = ', '.join(f'(SELECT substr(md5(random()::text), 1, {colsize}))' for i in range(ncolumns))\n insert_data=f'insert into {table_name} select generate_series(1,{nrows}),{insert_definitions}, now();' \n db.execute_sqls(insert_data) \n\n #initialization of the indexes\n nindex=int((nindex*ncolumns)/10)\n db.build_index(table_name,nindex)\n id_index='CREATE INDEX index_'+table_name+'_id ON '+table_name+'(id);'\n db.execute_sqls(id_index)\n\n #lock_contention\n pool = Pool(threads)\n for _ in range(threads):\n pool.apply_async(\n lock, (table_name, ncolumns, colsize, duration, nrows))\n pool.close()\n pool.join()\n\n #drop the index\n db.drop_index(table_name)\n\n #delete the table\n delete_table(table_name)\n print_time()\n\n\ndef lock(table_name, ncolumns, colsize, duration, nrows):\n args=init()\n start = time.time()\n #lock_contention\n while time.time()-start < duration:\n conn = psycopg2.connect(database=args.dbname, user=args.user, password=args.password,\n host=args.host, port=args.port)\n cur = conn.cursor()\n while time.time()-start < duration:\n col_name = random.randint(0, ncolumns-1)\n row_name = random.randint(1, nrows-1)\n lock_contention = f'update {table_name} set name{col_name}=(SELECT substr(md5(random()::text), 1, {colsize})) where id ={row_name}'\n #db.concurrent_execute_sql(threads,duration,lock_contention,nrows)\n cur.execute(lock_contention)\n conn.commit()\n conn.commit()\n conn.close()\n\nif __name__ == \"__main__\":\n # Number of threads to use for concurrent inserts\n num_threads = 6\n \n # Duration for which to run the inserts (in seconds)\n insert_duration = None\n \n # Number of columns in the table\n num_columns = 78\n \n # Number of rows to insert\n num_rows = 512563\n \n # Size of each column (in characters)\n column_size = 80\n \n # Table name\n table_name = 'table1'\n \n nindex=6\n \n # Call the insert_large_data function\n redundent_index(num_threads, insert_duration, num_columns, num_rows, column_size, nindex,table_name)\n", + "description": "In a large-scale e-commerce database, multiple users are performing queries on a table with 78 columns and 512,563 rows of product records, each with a column size of 80 characters. However, there is a performance issue caused by the creation of redundant indexes for various search parameters such as product name, category, and price range. 
This impacts both the storage footprint and query performance.\n", + "desc": "In an Internet of Things (IoT) scenario, let's consider a database named 'SensorDataDB', used for collecting and analyzing sensor data. This database is designed to handle a large volume of data from various types of sensors. One of the key tables in this database is the 'SensorReadings' table, which stores information about sensor readings. This table contains 512,563 rows of data, representing readings from different sensors, with a total of 78 columns. Each column can store information of up to 80 characters.Due to the nature of sensor data and the need for efficient analysis, the database administrator might have created redundant indexes for optimizing query performance. These indexes could have been created based on various factors such as sensor type, reading values, or timestamps. However, the presence of these redundant indexes can lead to additional storage usage and overhead in the database.Furthermore, frequent index operations can cause database fragmentation, affecting overall performance. When multiple users simultaneously query or analyze sensor data, especially during peak periods, the presence of redundant indexes might slow down the execution of these queries. This can result in delayed data retrieval, impacting the efficiency of the analysis process in the IoT environment.\n" + }, + "125": { + "start_time": "1697315093", + "end_time": "1697315183", + "start_timestamp": "2023-10-15 04:24:53", + "end_timestamp": "2023-10-15 04:26:23", + "alerts": [], + "labels": [ + "INSERT_LARGE_DATA", + "IO_CONTENTION" + ], + "command": "python anomaly_trigger/main.py --anomaly INSERT_LARGE_DATA,IO_CONTENTION", + "script": "import os\nimport datetime\n\n#print the current time\ndef print_time():\n current_time = datetime.datetime.now()\n formatted_time = current_time.strftime(\"%Y-%m-%d %H:%M:%S\")\n print(formatted_time)\n\nif __name__ == \"__main__\":\n print_time()\n command = (\n \"su - root -c 'cd /sysbench-tpcc-master; \"\n \"./tpcc.lua --db-driver=pgsql --tables=2 --scale=3 --threads=50 --events=0 \"\n \"--pgsql-host=xxxx --pgsql-user=xxxx --pgsql-password=xxxx \"\n \"--pgsql-port=5432 --pgsql-db=tpcc --time=90 --rand-type=uniform --report-interval=10 run'\"\n )\n\n os.system(command)\n print_time()\n", + "description": "In a file sharing system where multiple users are simultaneously uploading, downloading, or editing files, there is intense competition for I/O resources. As a result, the file transfer process slows down due to the increased congestion and contention for access to the underlying storage system.\n", + "desc": "In a file sharing system, there is a database called 'FileShareDB' that allows users to upload, download, and edit files. This database stores the files themselves as well as metadata such as file size, uploader information, creation and modification dates, access permissions, and download counts. During normal usage, multiple users may be simultaneously performing file operations, such as uploading or downloading files. This includes handling large files such as presentations, video recordings, or design drawings. However, due to the high concurrency of these file operations, the system may experience I/O contention. This means that the storage and network bandwidth can become strained when multiple users are uploading or downloading large files. 
As a result, file transfer speeds may be slower, especially during peak periods when server processing capabilities or bandwidth are limited. Additionally, frequent write operations in the database, such as file uploads and metadata updates, may impact overall database performance. This can lead to issues like locking or transaction management problems, further slowing down file processing and metadata recording.\n" + }, + "126": { + "start_time": "1697315243", + "end_time": "1697315304", + "start_timestamp": "2023-10-15 04:27:23", + "end_timestamp": "2023-10-15 04:28:24", + "alerts": [], + "labels": [ + "POOR JOIN PERFORMANCE", + "CPU CONTENTION" + ], + "command": "python anomaly_trigger/main.py --anomaly POOR_JOIN_PERFORMANCE,CPU_CONTENTION", + "script": "import os\nimport re\nimport time\n\nimport psycopg2\n\n\nREPEATCOUNT = 1\nTIMELOGPATH = str(int(time.time())) + \"_job_trigger_time_log.txt\"\nTIMELOG = open(TIMELOGPATH, 'w+')\n\n\nclass Database():\n\n def __init__(self):\n self.conn = None\n self.conn = psycopg2.connect(database='imdbload',\n user='xxxx',\n password='xxxx',\n host='xxxx',\n port=5432)\n\n def execute_sql(self, sql):\n fail = 1\n cur = self.conn.cursor()\n i = 0\n cnt = 3\n while fail == 1 and i < cnt:\n try:\n fail = 0\n cur.execute(sql)\n except BaseException as error:\n fail = 1\n print(error)\n res = []\n if fail == 0:\n res = cur.fetchall()\n i = i + 1\n if fail == 1:\n # print(\"SQL Execution Fatal!!\", sql)\n return 0, ''\n elif fail == 0:\n return 1, res\n\n\ndef all_sql_files():\n res_path = \"{}/join-order-benchmark-master/\".format(\n os.path.dirname(os.path.abspath(__file__)))\n # all_file_list = list(filter(file_filter, os.listdir(res_path)))\n # all_file_list = sorted(all_file_list, key=custom_sort)\n all_file_list = [\n '1a.sql', '1b.sql', '1c.sql', '1d.sql',\n '2a.sql', '2b.sql', '2c.sql', '2d.sql',\n '3a.sql', '3b.sql', '3c.sql',\n '4a.sql', '4b.sql', '4c.sql',\n '5a.sql', '5b.sql', '5c.sql',\n '6a.sql', '6b.sql', '6c.sql', '6d.sql', '6e.sql', '6f.sql',\n '7a.sql', '7b.sql', '7c.sql',\n '8a.sql', '8b.sql', '8c.sql', '8d.sql',\n '9a.sql', '9b.sql', '9c.sql', '9d.sql',\n '10a.sql', '10b.sql', '10c.sql',\n '11a.sql', '11b.sql', '11c.sql', '11d.sql',\n '12a.sql', '12b.sql', '12c.sql',\n '13a.sql', '13b.sql', '13c.sql', '13d.sql',\n '14a.sql', '14b.sql', '14c.sql',\n '15a.sql', '15b.sql', '15c.sql', '15d.sql',\n '16a.sql', '16b.sql', '16c.sql', '16d.sql',\n '17a.sql', '17b.sql', '17c.sql', '17d.sql', '17e.sql', '17f.sql',\n '18a.sql', '18b.sql', '18c.sql',\n '19a.sql', '19b.sql', '19c.sql', '19d.sql',\n '20a.sql', '20b.sql', '20c.sql',\n '21a.sql', '21b.sql', '21c.sql',\n '22a.sql', '22b.sql', '22c.sql', '22d.sql',\n '23a.sql', '23b.sql', '23c.sql',\n '24a.sql', '24b.sql',\n '25a.sql', '25b.sql', '25c.sql',\n '26a.sql', '26b.sql', '26c.sql',\n '27a.sql', '27b.sql', '27c.sql',\n '28a.sql', '28b.sql', '28c.sql',\n '29a.sql', '29b.sql', '29c.sql',\n '30a.sql', '30b.sql', '30c.sql',\n '31a.sql', '31b.sql', '31c.sql',\n '32a.sql', '32b.sql',\n '33a.sql', '33b.sql', '33c.sql']\n\n print(all_file_list)\n files_list = []\n for file in all_file_list:\n files_list.append(res_path + file)\n return files_list\n\n\ndef custom_sort(item):\n # \u63d0\u53d6\u6570\u5b57\u548c\u5b57\u6bcd\u90e8\u5206\n match = re.match(r'(\\d+)(\\D+)', item)\n # \u5c06\u6570\u5b57\u90e8\u5206\u8f6c\u6362\u4e3a\u6574\u6570\u4ee5\u8fdb\u884c\u6bd4\u8f83\n num_part = int(match.group(1))\n # \u8fd4\u56de\u5143\u7ec4\u4ee5\u6309\u6570\u5b57\u548c\u5b57\u6bcd\u6392\u5e8f\n return (num_part, 
match.group(2))\n\n\ndef file_filter(f):\n if f[-4:] == '.sql' and 'schema' not in f and 'fkindexes' not in f:\n return True\n else:\n return False\n\n\ndef get_sql_from_file(file_name):\n file = open(file_name)\n lines = file.readlines().copy()\n sql = ''\n for line in lines:\n sql += line\n sql = sql.replace('\n', ' ').replace(' ', ' ').replace(' ', ' ')\n file.close()\n return sql\n\n\ndef test_hint_from_file(sql_file):\n db = Database()\n sql = get_sql_from_file(sql_file)\n success, result_cont = db.execute_sql(sql)\n print(success, result_cont)\n\n\ndef test_all():\n sql_files = all_sql_files()\n\n for sql_file in list(sql_files)[:-10]:\n if sql_file:\n test_hint_from_file(sql_file)\n\n\ndef test_one():\n res_path = \"{}/join-order-benchmark-master/\".format(\n os.path.dirname(os.path.abspath(__file__)))\n test_hint_from_file(res_path + '1a.sql')\n\n\nif __name__ == '__main__':\n for i in range(0, REPEATCOUNT):\n TIMELOG.write(str(int(time.time()))+\";\")\n test_all()\n TIMELOG.write(str(int(time.time()))+\"\n\")\n TIMELOG.flush()\n\n TIMELOG.close()\n\n)\n", + "description": "In a financial system that processes large amounts of transaction data, there is a performance issue when performing a join operation to combine data from multiple tables. Additionally, there is CPU contention as multiple users are simultaneously executing complex calculations and algorithms, causing a slowdown in overall system performance.\n", + "desc": "In a business intelligence scenario, particularly in the database of a large corporation named 'CorporateAnalyticsDB', there is a need to perform complex join queries across multiple tables to generate comprehensive business reports. The tables in this database contain various business data, such as sales records, customer profiles, product catalog, and market trends. However, due to the lack of effective indexing of join keys or improper query optimization, these join operations can be slow and resource-intensive. During peak periods, when multiple complex join queries are executed simultaneously, there might be competition for CPU resources, leading to reduced query efficiency and performance. 
This CPU contention can be caused by either a large number of compute-intensive queries running on the database server or inadequate CPU resources to handle the workload.\n" + }, + "127": { + "start_time": "1697315364", + "end_time": "1697315514", + "start_timestamp": "2023-10-15 04:29:24", + "end_timestamp": "2023-10-15 04:31:54", + "alerts": [], + "labels": [ + "FETCH_LARGE_DATA", + "CORRELATED SUBQUERY" + ], + "command": "python anomaly_trigger/main.py --anomaly FETCH_LARGE_DATA,CORRELATED_SUBQUERY", + "script": "import os\nimport re\nimport time\n\nimport psycopg2\n\n\nREPEATCOUNT = 1\nTIMELOGPATH = str(int(time.time())) + \"_tpch_trigger_time_log.txt\"\nTIMELOG = open(TIMELOGPATH, 'w+')\n\n\nclass Database():\n\n def __init__(self):\n self.conn = None\n self.conn = psycopg2.connect(database='tpch',\n user='xxxx',\n password='xxxx',\n host='xxxx',\n port=5432)\n\n def execute_sql(self, sql):\n fail = 1\n cur = self.conn.cursor()\n i = 0\n cnt = 3\n while fail == 1 and i < cnt:\n try:\n fail = 0\n cur.execute(sql)\n except BaseException as error:\n fail = 1\n print(error)\n res = []\n if fail == 0:\n res = cur.fetchall()\n i = i + 1\n if fail == 1:\n # print(\"SQL Execution Fatal!!\", sql)\n return 0, ''\n elif fail == 0:\n return 1, res\n\n\ndef all_sql_files():\n res_path = \"{}/tpch-queries/\".format(\n os.path.dirname(os.path.abspath(__file__)))\n # all_file_list = list(filter(file_filter, os.listdir(res_path)))\n # all_file_list = sorted(all_file_list, key=custom_sort)\n all_file_list = [\n '4.explain.sql']\n\n print(all_file_list)\n files_list = []\n for file in all_file_list:\n files_list.append(res_path + file)\n return files_list\n\n\ndef custom_sort(item):\n # \u63d0\u53d6\u6570\u5b57\u548c\u5b57\u6bcd\u90e8\u5206\n match = re.match(r'(\\d+)(\\D+)', item)\n # \u5c06\u6570\u5b57\u90e8\u5206\u8f6c\u6362\u4e3a\u6574\u6570\u4ee5\u8fdb\u884c\u6bd4\u8f83\n num_part = int(match.group(1))\n # \u8fd4\u56de\u5143\u7ec4\u4ee5\u6309\u6570\u5b57\u548c\u5b57\u6bcd\u6392\u5e8f\n return (num_part, match.group(2))\n\n\ndef file_filter(f):\n if f[-4:] == '.sql' and 'schema' not in f and 'fkindexes' not in f:\n return True\n else:\n return False\n\n\ndef get_sql_from_file(file_name):\n file = open(file_name)\n lines = file.readlines().copy()\n sql = ''\n for line in lines:\n sql += line\n sql = sql.replace('\n', ' ').replace(' ', ' ').replace(' ', ' ')\n file.close()\n return sql\n\n\ndef test_hint_from_file(sql_file):\n db = Database()\n sql = get_sql_from_file(sql_file)\n success, result_cont = db.execute_sql(sql)\n print(success, result_cont)\n\n\ndef test_all():\n sql_files = all_sql_files()\n\n for sql_file in list(sql_files):\n if sql_file:\n test_hint_from_file(sql_file)\n\n\ndef test_one():\n res_path = \"{}/tpch-queries/\".format(\n os.path.dirname(os.path.abspath(__file__)))\n test_hint_from_file(res_path + '1.explain.sql')\n\n\nif __name__ == '__main__':\n for i in range(0, REPEATCOUNT):\n TIMELOG.write(str(int(time.time()))+\";\")\n test_all()\n TIMELOG.write(str(int(time.time()))+\"\n\")\n TIMELOG.flush()\n\n TIMELOG.close()\n", + "description": "In an online store's inventory management system, fetching large amounts of data from the database and executing correlated subqueries to find the quantity of inventory for each product can lead to performance issues. 
This script simulates the database exception caused by this process.\n", + "desc": "In the context of an e-commerce scenario, suppose there is a database called 'ECommerceDB' that stores information about products, including inventory data. Within this database, there is a table named 'ProductInventory' which contains inventory information for thousands or even hundreds of thousands of products. Each product's inventory entry includes details such as product ID, current stock level, last inventory update time, supplier ID, and warehouse location. In this database, when a query is executed to fetch the inventory data of each product, it might involve performing related subqueries. For example, a common query could be to determine the total current inventory of all products within a specific category. To obtain this information, the database would need to retrieve the relevant products from the 'ProductDetails' table and then perform subqueries on the 'ProductInventory' table to fetch the corresponding inventory data. However, when there is a large number of products within the category, executing individual subqueries for each product can result in inefficient performance. This is because fetching inventory information for a large number of products requires reading a significant amount of data from the disk, leading to potential I/O bottlenecks.\n" + }, + "128": { + "start_time": "1697315574", + "end_time": "1697315645", + "start_timestamp": "2023-10-15 04:32:54", + "end_timestamp": "2023-10-15 04:34:05", + "alerts": [], + "labels": [ + "highly concurrent commits or highly concurrent inserts" + ], + "command": "python anomaly_trigger/main.py --anomaly INSERT_LARGE_DATA", + "script": "import psycopg2\nimport sys\nsys.path.append('/root/DB-GPT/')\nimport time\nimport datetime\nimport random\nimport yaml\nfrom multiprocessing.pool import *\n\nclass DBArgs(object):\n\n def __init__(self, dbtype, config, dbname=None):\n self.dbtype = dbtype\n if self.dbtype == 'mysql':\n self.host = config['host']\n self.port = config['port']\n self.user = config['user']\n self.password = config['password']\n self.dbname = dbname if dbname else config['dbname']\n self.driver = 'com.mysql.jdbc.Driver'\n self.jdbc = 'jdbc:mysql://'\n else:\n self.host = config['host']\n self.port = config['port']\n self.user = config['user']\n self.password = config['password']\n self.dbname = dbname if dbname else config['dbname']\n self.driver = 'org.postgresql.Driver'\n self.jdbc = 'jdbc:postgresql://'\n\nclass Database():\n def __init__(self, args, timeout=-1):\n self.args = args\n self.conn = self.resetConn(timeout)\n\n\n # self.schema = self.compute_table_schema()\n\n def resetConn(self, timeout=-1):\n conn = psycopg2.connect(database=self.args.dbname,\n user=self.args.user,\n password=self.args.password,\n host=self.args.host,\n port=self.args.port)\n return conn\n\n def execute_sqls(self,sql):\n self.conn =self.resetConn(timeout=-1)\n cur = self.conn.cursor()\n cur.execute(sql)\n self.conn.commit()\n cur.close()\n self.conn.close()\n\n def execute_sql_duration(self, duration, sql, max_id=0, commit_interval=500):\n self.conn = self.resetConn(timeout=-1)\n cursor = self.conn.cursor()\n start = time.time()\n cnt = 0\n if duration > 0:\n while (time.time() - start) < duration:\n if max_id > 0:\n id = random.randint(1, max_id - 1)\n cursor.execute(sql + str(id) + ';')\n else:\n cursor.execute(sql)\n cnt += 1\n if cnt % commit_interval == 0:\n self.conn.commit()\n else:\n print(\"error, the duration should be larger than 0\")\n 
self.conn.commit()\n cursor.close()\n self.conn.close()\n return cnt\n\n def concurrent_execute_sql(self, threads, duration, sql, max_id=0, commit_interval=500):\n pool = ThreadPool(threads)\n results = [pool.apply_async(self.execute_sql_duration, (duration, sql, max_id, commit_interval)) for _ in range(threads)]\n pool.close()\n pool.join()\n return results\n \ndef init():\n #add the config\n config_path = \"/root/DB-GPT/config/tool_config.yaml\"\n with open(config_path, 'r') as config_file:\n config = yaml.safe_load(config_file) \n db_args =DBArgs('pgsql', config)\n return db_args\n\n#create a table\ndef create_table(table_name,colsize, ncolumns):\n db=Database(init())\n column_definitions = ', '.join(f'name{i} varchar({colsize})' for i in range(ncolumns))\n creat_sql = f'CREATE TABLE {table_name} (id int, {column_definitions}, time timestamp);'\n db.execute_sqls(creat_sql)\n\n#delete the table\ndef delete_table(table_name):\n db=Database(init())\n delete_sql=f'DROP TABLE if exists {table_name}'\n db.execute_sqls(delete_sql)\n\n#print the current time\ndef print_time():\n current_time = datetime.datetime.now()\n formatted_time = current_time.strftime(\"%Y-%m-%d %H:%M:%S\")\n print(formatted_time)\n\n\ndef insert_large_data(threads,duration,ncolumns,nrows,colsize,table_name='table1'):\n\n print_time()\n #Delete undeleted tables\n delete_table(table_name)\n #create a new table\n create_table(table_name,colsize, ncolumns)\n db=Database(init())\n #insert the data\n #insert_definitions = ', '.join(f'repeat(round(random()*999)::text,{(colsize//3)})' for i in range(ncolumns))\n insert_definitions = ', '.join(f'(SELECT substr(md5(random()::text), 1, {colsize}))' for i in range(ncolumns))\n insert_data=f'insert into {table_name} select generate_series(1,{nrows}),{insert_definitions}, now();'\n db.concurrent_execute_sql(threads,duration,insert_data,commit_interval=1)\n\n #delete the table\n delete_table(table_name)\n \n #print the end time\n print_time()\nif __name__ == \"__main__\":\n # Number of threads to use for concurrent inserts\n num_threads = 65\n \n # Duration for which to run the inserts (in seconds)\n insert_duration = None\n \n # Number of columns in the table\n num_columns = 9\n \n # Number of rows to insert\n num_rows = 79\n \n # Size of each column (in characters)\n column_size = 60\n \n # Table name\n table_name = 'table1'\n \n # Call the insert_large_data function\n insert_large_data(num_threads, insert_duration, num_columns, num_rows, column_size, table_name)\n", + "description": "In a sensor data collection system, 65 sensors are generating a large amount of data simultaneously. This data needs to be inserted into a database table with 9 columns and 79 rows. Each column has a size of 60 characters. This process may cause a database exception due to the high volume of data being inserted.\n", + "desc": "In an Internet of Things (IoT) scenario, imagine a database used for collecting and analyzing sensor data. This database is designed to handle a large volume of data from various types of sensors. There is a primary table called 'SensorData', which contains information such as sensor ID, reading type, reading value, timestamp, and sensor location. In this scenario, due to the simultaneous transmission of data from 65 sensors at a very high frequency, the database might encounter performance issues. This can be attributed to the lack of effective data partitioning in the 'SensorData' table or insufficient buffering mechanisms. 
Additionally, if the database is not properly optimized or lack necessary indexes, it might struggle to process these numerous concurrent write requests efficiently. This can result in increased write latency and may even lead to database locking, causing anomalies in the database. The anomalies can not only affect the performance of the database but also impact the reliability and accuracy of the collected sensor data.\n" + }, + "129": { + "start_time": "1697315706", + "end_time": "1697315777", + "start_timestamp": "2023-10-15 04:35:06", + "end_timestamp": "2023-10-15 04:36:17", + "alerts": [], + "labels": [ + "highly concurrent commits or highly concurrent inserts" + ], + "command": "python anomaly_trigger/main.py --anomaly INSERT_LARGE_DATA", + "script": "import psycopg2\nimport sys\nsys.path.append('/root/DB-GPT/')\nimport time\nimport datetime\nimport random\nimport yaml\nfrom multiprocessing.pool import *\n\nclass DBArgs(object):\n\n def __init__(self, dbtype, config, dbname=None):\n self.dbtype = dbtype\n if self.dbtype == 'mysql':\n self.host = config['host']\n self.port = config['port']\n self.user = config['user']\n self.password = config['password']\n self.dbname = dbname if dbname else config['dbname']\n self.driver = 'com.mysql.jdbc.Driver'\n self.jdbc = 'jdbc:mysql://'\n else:\n self.host = config['host']\n self.port = config['port']\n self.user = config['user']\n self.password = config['password']\n self.dbname = dbname if dbname else config['dbname']\n self.driver = 'org.postgresql.Driver'\n self.jdbc = 'jdbc:postgresql://'\n\nclass Database():\n def __init__(self, args, timeout=-1):\n self.args = args\n self.conn = self.resetConn(timeout)\n\n\n # self.schema = self.compute_table_schema()\n\n def resetConn(self, timeout=-1):\n conn = psycopg2.connect(database=self.args.dbname,\n user=self.args.user,\n password=self.args.password,\n host=self.args.host,\n port=self.args.port)\n return conn\n\n def execute_sqls(self,sql):\n self.conn =self.resetConn(timeout=-1)\n cur = self.conn.cursor()\n cur.execute(sql)\n self.conn.commit()\n cur.close()\n self.conn.close()\n\n def execute_sql_duration(self, duration, sql, max_id=0, commit_interval=500):\n self.conn = self.resetConn(timeout=-1)\n cursor = self.conn.cursor()\n start = time.time()\n cnt = 0\n if duration > 0:\n while (time.time() - start) < duration:\n if max_id > 0:\n id = random.randint(1, max_id - 1)\n cursor.execute(sql + str(id) + ';')\n else:\n cursor.execute(sql)\n cnt += 1\n if cnt % commit_interval == 0:\n self.conn.commit()\n else:\n print(\"error, the duration should be larger than 0\")\n self.conn.commit()\n cursor.close()\n self.conn.close()\n return cnt\n\n def concurrent_execute_sql(self, threads, duration, sql, max_id=0, commit_interval=500):\n pool = ThreadPool(threads)\n results = [pool.apply_async(self.execute_sql_duration, (duration, sql, max_id, commit_interval)) for _ in range(threads)]\n pool.close()\n pool.join()\n return results\n \ndef init():\n #add the config\n config_path = \"/root/DB-GPT/config/tool_config.yaml\"\n with open(config_path, 'r') as config_file:\n config = yaml.safe_load(config_file) \n db_args =DBArgs('pgsql', config)\n return db_args\n\n#create a table\ndef create_table(table_name,colsize, ncolumns):\n db=Database(init())\n column_definitions = ', '.join(f'name{i} varchar({colsize})' for i in range(ncolumns))\n creat_sql = f'CREATE TABLE {table_name} (id int, {column_definitions}, time timestamp);'\n db.execute_sqls(creat_sql)\n\n#delete the table\ndef delete_table(table_name):\n 
db=Database(init())\n delete_sql=f'DROP TABLE if exists {table_name}'\n db.execute_sqls(delete_sql)\n\n#print the current time\ndef print_time():\n current_time = datetime.datetime.now()\n formatted_time = current_time.strftime(\"%Y-%m-%d %H:%M:%S\")\n print(formatted_time)\n\n\ndef insert_large_data(threads,duration,ncolumns,nrows,colsize,table_name='table1'):\n\n print_time()\n #Delete undeleted tables\n delete_table(table_name)\n #create a new table\n create_table(table_name,colsize, ncolumns)\n db=Database(init())\n #insert the data\n #insert_definitions = ', '.join(f'repeat(round(random()*999)::text,{(colsize//3)})' for i in range(ncolumns))\n insert_definitions = ', '.join(f'(SELECT substr(md5(random()::text), 1, {colsize}))' for i in range(ncolumns))\n insert_data=f'insert into {table_name} select generate_series(1,{nrows}),{insert_definitions}, now();'\n db.concurrent_execute_sql(threads,duration,insert_data,commit_interval=1)\n\n #delete the table\n delete_table(table_name)\n \n #print the end time\n print_time()\nif __name__ == \"__main__\":\n # Number of threads to use for concurrent inserts\n num_threads = 65\n \n # Duration for which to run the inserts (in seconds)\n insert_duration = None\n \n # Number of columns in the table\n num_columns = 30\n \n # Number of rows to insert\n num_rows = 68\n \n # Size of each column (in characters)\n column_size = 91\n \n # Table name\n table_name = 'table1'\n \n # Call the insert_large_data function\n insert_large_data(num_threads, insert_duration, num_columns, num_rows, column_size, table_name)\n", + "description": "In a scientific research project, 65 sensors are collecting data simultaneously and trying to insert it into the database. Each data point contains 30 columns with a size of 91 characters, and there are a total of 68 data points. This simulates the scenario where a large amount of data is being inserted into the database, causing a potential exception due to the high volume of concurrent insertions.\n", + "desc": "In an Internet of Things (IoT) scenario, there is a database used for collecting and analyzing sensor data. The database is called 'SensorDataDB' and it is designed to handle a large amount of data from various types of sensors. One of the key tables in this database is 'SensorReadings', which stores information about different sensor readings. Each row in the table represents a specific reading and there are a total of 68 rows of data. The table has 30 columns, each containing information of up to 91 characters. These columns include sensor ID, reading type, reading value, timestamp, sensor location, and status information. In this scenario, the database is experiencing performance issues due to the lack of proper optimization for handling large-scale data insertions. When 65 users simultaneously insert large amounts of data into the 'SensorReadings' table, the database struggles to process these write requests efficiently. This can result in increased write latency and potentially lead to anomalies in the database. 
These anomalies can impact the overall performance and reliability of the IoT system.\n" + }, + "130": { + "start_time": "1697315837", + "end_time": "1697315897", + "start_timestamp": "2023-10-15 04:37:17", + "end_timestamp": "2023-10-15 04:38:17", + "alerts": [], + "labels": [ + "highly concurrent updates" + ], + "command": "python anomaly_trigger/main.py --anomaly LOCK_CONTENTION", + "script": "import psycopg2\nimport sys\nsys.path.append('/root/DB-GPT/')\nimport time\nimport datetime\nimport random\nimport yaml\nfrom multiprocessing.pool import *\n\n\nclass DBArgs(object):\n\n def __init__(self, dbtype, config, dbname=None):\n self.dbtype = dbtype\n if self.dbtype == 'mysql':\n self.host = config['host']\n self.port = config['port']\n self.user = config['user']\n self.password = config['password']\n self.dbname = dbname if dbname else config['dbname']\n self.driver = 'com.mysql.jdbc.Driver'\n self.jdbc = 'jdbc:mysql://'\n else:\n self.host = config['host']\n self.port = config['port']\n self.user = config['user']\n self.password = config['password']\n self.dbname = dbname if dbname else config['dbname']\n self.driver = 'org.postgresql.Driver'\n self.jdbc = 'jdbc:postgresql://'\n\nclass Database():\n def __init__(self, args, timeout=-1):\n self.args = args\n self.conn = self.resetConn(timeout)\n\n\n # self.schema = self.compute_table_schema()\n\n def resetConn(self, timeout=-1):\n conn = psycopg2.connect(database=self.args.dbname,\n user=self.args.user,\n password=self.args.password,\n host=self.args.host,\n port=self.args.port)\n return conn\n \n def execute_sqls(self,sql):\n self.conn =self.resetConn(timeout=-1)\n cur = self.conn.cursor()\n cur.execute(sql)\n self.conn.commit()\n cur.close()\n self.conn.close()\n\n def execute_sql_duration(self, duration, sql, max_id=0, commit_interval=500):\n self.conn = self.resetConn(timeout=-1)\n cursor = self.conn.cursor()\n start = time.time()\n cnt = 0\n if duration > 0:\n while (time.time() - start) < duration:\n if max_id > 0:\n id = random.randint(1, max_id - 1)\n cursor.execute(sql + str(id) + ';')\n else:\n cursor.execute(sql)\n cnt += 1\n if cnt % commit_interval == 0:\n self.conn.commit()\n else:\n print(\"error, the duration should be larger than 0\")\n self.conn.commit()\n cursor.close()\n self.conn.close()\n return cnt\n\n def concurrent_execute_sql(self, threads, duration, sql, max_id=0, commit_interval=500):\n pool = ThreadPool(threads)\n results = [pool.apply_async(self.execute_sql_duration, (duration, sql, max_id, commit_interval)) for _ in range(threads)]\n pool.close()\n pool.join()\n return results\n\ndef init():\n #add the config\n config_path = \"/root/DB-GPT/config/tool_config.yaml\"\n with open(config_path, 'r') as config_file:\n config = yaml.safe_load(config_file) \n db_args =DBArgs('pgsql', config)\n return db_args\n\n\n#create a table\ndef create_table(table_name,colsize, ncolumns):\n db=Database(init())\n column_definitions = ', '.join(f'name{i} varchar({colsize})' for i in range(ncolumns))\n creat_sql = f'CREATE TABLE {table_name} (id int, {column_definitions}, time timestamp);'\n db.execute_sqls(creat_sql)\n\n#delete the table\ndef delete_table(table_name):\n db=Database(init())\n delete_sql=f'DROP TABLE if exists {table_name}'\n db.execute_sqls(delete_sql)\n\n#print the current time\ndef print_time():\n current_time = datetime.datetime.now()\n formatted_time = current_time.strftime(\"%Y-%m-%d %H:%M:%S\")\n print(formatted_time)\n\n\ndef lock_contention(threads,duration,ncolumns,nrows,colsize,table_name='table1'):\n 
#create a new table\n print_time()\n delete_table(table_name)\n create_table(table_name,colsize, ncolumns)\n db=Database(init())\n # insert some data to be updated \n insert_definitions = ', '.join(f'(SELECT substr(md5(random()::text), 1, {colsize}))' for i in range(ncolumns))\n insert_data=f'insert into {table_name} select generate_series(1,{nrows}),{insert_definitions}, now();' \n db.execute_sqls(insert_data) \n pool = Pool(threads)\n for _ in range(threads):\n pool.apply_async(\n lock, (table_name, ncolumns, colsize, duration, nrows))\n pool.close()\n pool.join()\n #delete the table\n delete_table(table_name)\n print_time()\n\ndef lock(table_name, ncolumns, colsize, duration, nrows):\n args=init()\n start = time.time()\n #lock_contention\n while time.time()-start < duration:\n conn = psycopg2.connect(database=args.dbname, user=args.user, password=args.password,\n host=args.host, port=args.port)\n cur = conn.cursor()\n while time.time()-start < duration:\n col_name = random.randint(0, ncolumns-1)\n row_name = random.randint(1, nrows-1)\n lock_contention = f'update {table_name} set name{col_name}=(SELECT substr(md5(random()::text), 1, {colsize})) where id ={row_name}'\n #db.concurrent_execute_sql(threads,duration,lock_contention,nrows)\n cur.execute(lock_contention)\n conn.commit()\n conn.commit()\n conn.close()\n\nif __name__ == \"__main__\":\n # Number of threads to use for concurrent inserts\n num_threads = 178\n \n # Duration for which to run the inserts (in seconds)\n insert_duration = None\n \n # Number of columns in the table\n num_columns = 89\n \n # Number of rows to insert\n num_rows = 289\n \n # Size of each column (in characters)\n column_size = 90\n \n # Table name\n table_name = 'table1'\n \n # Call the insert_large_data function\n lock_contention(num_threads, insert_duration, num_columns, num_rows, column_size, table_name)\n", + "description": "In a database used by an online marketplace, there are 178 users simultaneously competing to update product records. The database table contains 89 columns and 289 rows, with each column having a size of 90 characters. This process simulates a scenario where multiple users are contending for locks on the database table, causing contention and potential exceptions.\n", + "desc": "In a live developed Internet of Things (IoT) setting, there is a database that contains a massive amount of sensor-generated data. This database is specifically designed for the purpose of collecting and analyzing sensor data and is called 'SensorDataDB'. The primary table within this database is called 'SensorReadings', which stores information from various types of sensors. Each row in this table represents a reading from a specific sensor, and there are 289 rows of data in total. The table consists of 89 columns, each with a size of 90 characters, where each column represents a different attribute such as sensor ID, reading type, reading value, timestamp, sensor location, and status information.In this particular setting, there is a scenario where there is a high level of contention for locking the 'SensorReadings' table. This occurs when 178 threads, representing various processes or devices accessing the database, attempt to access or update the same or adjacent rows in the table simultaneously. Due to the database's locking mechanism, this contention for locking can lead to performance issues and delays in processing requests. 
The duration and severity of this contention can greatly impact the efficiency and functionality of the overall IoT system.\n" + }, + "131": { + "start_time": "1697315957", + "end_time": "1697316024", + "start_timestamp": "2023-10-15 04:39:17", + "end_timestamp": "2023-10-15 04:40:24", + "alerts": [], + "labels": [ + "highly deletes" + ], + "command": "python anomaly_trigger/main.py --anomaly VACUUM", + "script": "import psycopg2\nimport sys\nsys.path.append('/root/DB-GPT/')\nimport time\nimport datetime\nimport random\nimport yaml\nfrom multiprocessing.pool import *\n\n\nclass DBArgs(object):\n\n def __init__(self, dbtype, config, dbname=None):\n self.dbtype = dbtype\n if self.dbtype == 'mysql':\n self.host = config['host']\n self.port = config['port']\n self.user = config['user']\n self.password = config['password']\n self.dbname = dbname if dbname else config['dbname']\n self.driver = 'com.mysql.jdbc.Driver'\n self.jdbc = 'jdbc:mysql://'\n else:\n self.host = config['host']\n self.port = config['port']\n self.user = config['user']\n self.password = config['password']\n self.dbname = dbname if dbname else config['dbname']\n self.driver = 'org.postgresql.Driver'\n self.jdbc = 'jdbc:postgresql://'\n\nclass Database():\n def __init__(self, args, timeout=-1):\n self.args = args\n self.conn = self.resetConn(timeout)\n\n\n # self.schema = self.compute_table_schema()\n\n def resetConn(self, timeout=-1):\n conn = psycopg2.connect(database=self.args.dbname,\n user=self.args.user,\n password=self.args.password,\n host=self.args.host,\n port=self.args.port)\n return conn\n \n def execute_sqls(self,sql):\n self.conn =self.resetConn(timeout=-1)\n cur = self.conn.cursor()\n cur.execute(sql)\n self.conn.commit()\n cur.close()\n self.conn.close()\n\n def execute_sql_duration(self, duration, sql, max_id=0, commit_interval=500):\n self.conn = self.resetConn(timeout=-1)\n cursor = self.conn.cursor()\n start = time.time()\n cnt = 0\n if duration > 0:\n while (time.time() - start) < duration:\n if max_id > 0:\n id = random.randint(1, max_id - 1)\n cursor.execute(sql + str(id) + ';')\n else:\n cursor.execute(sql)\n cnt += 1\n if cnt % commit_interval == 0:\n self.conn.commit()\n else:\n print(\"error, the duration should be larger than 0\")\n self.conn.commit()\n cursor.close()\n self.conn.close()\n return cnt\n\n def concurrent_execute_sql(self, threads, duration, sql, max_id=0, commit_interval=500):\n pool = ThreadPool(threads)\n results = [pool.apply_async(self.execute_sql_duration, (duration, sql, max_id, commit_interval)) for _ in range(threads)]\n pool.close()\n pool.join()\n return results\n \ndef init():\n #add the config\n config_path = \"/root/DB-GPT/config/tool_config.yaml\"\n with open(config_path, 'r') as config_file:\n config = yaml.safe_load(config_file) \n db_args =DBArgs('pgsql', config)\n return db_args\n\n\n#create a table\ndef create_table(table_name,colsize, ncolumns):\n db=Database(init())\n column_definitions = ', '.join(f'name{i} varchar({colsize})' for i in range(ncolumns))\n creat_sql = f'CREATE TABLE {table_name} (id int, {column_definitions}, time timestamp);'\n db.execute_sqls(creat_sql)\n\n#delete the table\ndef delete_table(table_name):\n db=Database(init())\n delete_sql=f'DROP TABLE if exists {table_name}'\n db.execute_sqls(delete_sql)\n\n#print the current time\ndef print_time():\n current_time = datetime.datetime.now()\n formatted_time = current_time.strftime(\"%Y-%m-%d %H:%M:%S\")\n print(formatted_time)\n\n\ndef 
vacuum(threads,duration,ncolumns,nrows,colsize,table_name='table1'):\n db=Database(init())\n #create a new table\n print_time()\n delete_table(table_name)\n create_table(table_name,colsize, ncolumns)\n\n # insert some data to be deleted\n insert_definitions = ', '.join(f'(SELECT substr(md5(random()::text), 1, {colsize}))' for i in range(ncolumns))\n insert_data=f'insert into {table_name} select generate_series(1,{nrows}),{insert_definitions}, now();' \n db.execute_sqls(insert_data) \n\n # delete 80% of the rows\n delete_nrows=int(nrows*0.8)\n vacuum=f'delete from {table_name} where id < {delete_nrows};'\n db.execute_sqls(vacuum)\n\n # do the select , then the vacuum occurs\n select='select * from '+table_name+' where id='\n db.concurrent_execute_sql(threads,duration,select,nrows)\n\n #delete the table\n delete_table(table_name)\n print_time()\n\n\nif __name__ == \"__main__\":\n # Number of threads to use for concurrent inserts\n num_threads = 161\n \n # Duration for which to run the inserts (in seconds)\n insert_duration = None\n \n # Number of columns in the table\n num_columns = 9\n \n # Number of rows to insert\n num_rows = 2927750\n \n # Size of each column (in characters)\n column_size = 91\n \n # Table name\n table_name = 'table1'\n \n # Call the insert_large_data function\n vacuum(num_threads, insert_duration, num_columns, num_rows, column_size, table_name)\n", + "description": "In a database for managing customer orders in an e-commerce platform, there is a need to periodically clean up and optimize the database table. However, if 161 users simultaneously perform searches on a table with 9 columns and 2,927,750 rows of order records, with each column containing 91 characters, it may cause a database exception due to the ongoing vacuuming process.\n", + "desc": "In a file sharing system scenario, there is a database called 'TeamFileShareDB' that is used by teams or organizations for sharing files. This database stores both the files themselves and metadata about the files, such as uploader information, file size, dates of creation and modification, version history, access permissions, and download counts. During peak periods, when the system experiences high concurrency in file operations, such as multiple users uploading or downloading files simultaneously, the database may encounter performance issues related to I/O (input/output) contention. This occurs when the system's storage and network bandwidth are strained by the high volume of file transfer requests. As a result, file transfer speeds may slow down, especially when there is limited bandwidth or insufficient server processing capabilities. Additionally, frequent write operations in the database, such as file uploads and metadata updates, can impact database performance. 
This can lead to issues with locking and transaction management, further slowing down file processing and metadata recording.\n" + }, + "132": { + "start_time": "1697316084", + "end_time": "1697316198", + "start_timestamp": "2023-10-15 04:41:24", + "end_timestamp": "2023-10-15 04:43:18", + "alerts": [], + "labels": [ + "too many indexes" + ], + "command": "python anomaly_trigger/main.py --anomaly REDUNDANT_INDEX", + "script": "import psycopg2\nimport sys\nsys.path.append('/root/DB-GPT/')\nimport time\nimport datetime\nimport random\nimport yaml\nfrom multiprocessing.pool import *\n\n\nclass DBArgs(object):\n\n def __init__(self, dbtype, config, dbname=None):\n self.dbtype = dbtype\n if self.dbtype == 'mysql':\n self.host = config['host']\n self.port = config['port']\n self.user = config['user']\n self.password = config['password']\n self.dbname = dbname if dbname else config['dbname']\n self.driver = 'com.mysql.jdbc.Driver'\n self.jdbc = 'jdbc:mysql://'\n else:\n self.host = config['host']\n self.port = config['port']\n self.user = config['user']\n self.password = config['password']\n self.dbname = dbname if dbname else config['dbname']\n self.driver = 'org.postgresql.Driver'\n self.jdbc = 'jdbc:postgresql://'\n\nclass Database():\n def __init__(self, args, timeout=-1):\n self.args = args\n self.conn = self.resetConn(timeout)\n\n\n # self.schema = self.compute_table_schema()\n\n def resetConn(self, timeout=-1):\n conn = psycopg2.connect(database=self.args.dbname,\n user=self.args.user,\n password=self.args.password,\n host=self.args.host,\n port=self.args.port)\n return conn\n \n def execute_sqls(self,sql):\n self.conn =self.resetConn(timeout=-1)\n cur = self.conn.cursor()\n cur.execute(sql)\n self.conn.commit()\n cur.close()\n self.conn.close()\n\n def execute_sql_duration(self, duration, sql, max_id=0, commit_interval=500):\n self.conn = self.resetConn(timeout=-1)\n cursor = self.conn.cursor()\n start = time.time()\n cnt = 0\n if duration > 0:\n while (time.time() - start) < duration:\n if max_id > 0:\n id = random.randint(1, max_id - 1)\n cursor.execute(sql + str(id) + ';')\n else:\n cursor.execute(sql)\n cnt += 1\n if cnt % commit_interval == 0:\n self.conn.commit()\n else:\n print(\"error, the duration should be larger than 0\")\n self.conn.commit()\n cursor.close()\n self.conn.close()\n return cnt\n\n def concurrent_execute_sql(self, threads, duration, sql, max_id=0, commit_interval=500):\n pool = ThreadPool(threads)\n results = [pool.apply_async(self.execute_sql_duration, (duration, sql, max_id, commit_interval)) for _ in range(threads)]\n pool.close()\n pool.join()\n return results\n \n def build_index(self, table_name, idx_num):\n self.conn = self.resetConn(timeout=-1)\n cursor = self.conn.cursor()\n \n for i in range(0, idx_num):\n the_sql = 'CREATE INDEX index_' + table_name + '_' + str(i) + ' ON ' + table_name + '(name' + str(i) + ');'\n print(the_sql)\n cursor.execute(the_sql)\n\n \n self.conn.commit()\n self.conn.close()\n return\n\n\n \n def drop_index(self,table_name):\n self.conn = self.resetConn(timeout=-1)\n cursor = self.conn.cursor()\n cursor.execute(\"select indexname from pg_indexes where tablename='\"+table_name+\"';\")\n idxs = cursor.fetchall()\n for idx in idxs:\n the_sql = 'DROP INDEX ' + idx[0] + ';'\n cursor.execute(the_sql)\n print(the_sql)\n self.conn.commit()\n self.conn.close()\n return\n\n\ndef init():\n #add the config\n config_path = \"/root/DB-GPT/config/tool_config.yaml\"\n with open(config_path, 'r') as config_file:\n config = 
yaml.safe_load(config_file) \n db_args =DBArgs('pgsql', config)\n return db_args\n\n\n#create a table\ndef create_table(table_name,colsize, ncolumns):\n db=Database(init())\n column_definitions = ', '.join(f'name{i} varchar({colsize})' for i in range(ncolumns))\n creat_sql = f'CREATE TABLE {table_name} (id int, {column_definitions}, time timestamp);'\n db.execute_sqls(creat_sql)\n\n#delete the table\ndef delete_table(table_name):\n db=Database(init())\n delete_sql=f'DROP TABLE if exists {table_name}'\n db.execute_sqls(delete_sql)\n\n#print the current time\ndef print_time():\n current_time = datetime.datetime.now()\n formatted_time = current_time.strftime(\"%Y-%m-%d %H:%M:%S\")\n print(formatted_time)\n\ndef redundent_index(threads,duration,ncolumns,nrows,colsize,nindex,table_name='table1'):\n #create a new table\n print_time()\n delete_table(table_name)\n create_table(table_name,colsize, ncolumns)\n db=Database(init())\n # insert some data to be updated \n insert_definitions = ', '.join(f'(SELECT substr(md5(random()::text), 1, {colsize}))' for i in range(ncolumns))\n insert_data=f'insert into {table_name} select generate_series(1,{nrows}),{insert_definitions}, now();' \n db.execute_sqls(insert_data) \n\n #initialization of the indexes\n nindex=int((nindex*ncolumns)/10)\n db.build_index(table_name,nindex)\n id_index='CREATE INDEX index_'+table_name+'_id ON '+table_name+'(id);'\n db.execute_sqls(id_index)\n\n #lock_contention\n pool = Pool(threads)\n for _ in range(threads):\n pool.apply_async(\n lock, (table_name, ncolumns, colsize, duration, nrows))\n pool.close()\n pool.join()\n\n #drop the index\n db.drop_index(table_name)\n\n #delete the table\n delete_table(table_name)\n print_time()\n\n\ndef lock(table_name, ncolumns, colsize, duration, nrows):\n args=init()\n start = time.time()\n #lock_contention\n while time.time()-start < duration:\n conn = psycopg2.connect(database=args.dbname, user=args.user, password=args.password,\n host=args.host, port=args.port)\n cur = conn.cursor()\n while time.time()-start < duration:\n col_name = random.randint(0, ncolumns-1)\n row_name = random.randint(1, nrows-1)\n lock_contention = f'update {table_name} set name{col_name}=(SELECT substr(md5(random()::text), 1, {colsize})) where id ={row_name}'\n #db.concurrent_execute_sql(threads,duration,lock_contention,nrows)\n cur.execute(lock_contention)\n conn.commit()\n conn.commit()\n conn.close()\n\nif __name__ == \"__main__\":\n # Number of threads to use for concurrent inserts\n num_threads = 5\n \n # Duration for which to run the inserts (in seconds)\n insert_duration = None\n \n # Number of columns in the table\n num_columns = 74\n \n # Number of rows to insert\n num_rows = 746668\n \n # Size of each column (in characters)\n column_size = 70\n \n # Table name\n table_name = 'table1'\n \n nindex=6\n \n # Call the insert_large_data function\n redundent_index(num_threads, insert_duration, num_columns, num_rows, column_size, nindex,table_name)\n", + "description": "In a financial database with 74 columns and 746,668 rows, each with a column size of 70 characters, multiple indexes are created for fields such as account number, transaction date, and amount at the beginning of a query. Five users then perform a query operation, and these indexes are removed after the query. This simulates the increased storage usage and performance impact caused by having redundant indexes in the database.\n", + "desc": "In an IoT scenario, there is a database designed to collect and analyze sensor data. 
This database is called 'SensorDataDB' and it handles a large volume of data from various types of sensors. One of the key tables in this database is 'SensorReadings', which stores information about sensor readings. This table contains 746,668 rows of data, each representing a reading from a sensor. There are 74 columns in this table, each with a size of 70 characters. These columns may include sensor ID, reading type, reading value, timestamp, sensor location, and status information. In this scenario, the database faces a problem caused by redundant index creation. Redundant indexes are created for the purpose of accelerating queries related to sensor data analysis, such as analyzing temperature trends over time or identifying sensors with abnormal readings. However, the frequent creation of redundant indexes can lead to additional storage usage and performance overhead in the database. It can also cause database fragmentation, which further affects performance. This can result in delayed generation of analysis reports and affect the efficiency of decision-making processes in IoT applications.\n" + }, + "133": { + "start_time": "1697316258", + "end_time": "1697316349", + "start_timestamp": "2023-10-15 04:44:18", + "end_timestamp": "2023-10-15 04:45:49", + "alerts": [], + "labels": [ + "INSERT_LARGE_DATA", + "IO_CONTENTION" + ], + "command": "python anomaly_trigger/main.py --anomaly INSERT_LARGE_DATA,IO_CONTENTION", + "script": "import os\nimport datetime\n\n#print the current time\ndef print_time():\n current_time = datetime.datetime.now()\n formatted_time = current_time.strftime(\"%Y-%m-%d %H:%M:%S\")\n print(formatted_time)\n\nif __name__ == \"__main__\":\n print_time()\n command = (\n \"su - root -c 'cd /sysbench-tpcc-master; \"\n \"./tpcc.lua --db-driver=pgsql --tables=2 --scale=3 --threads=50 --events=0 \"\n \"--pgsql-host=xxxx --pgsql-user=xxxx --pgsql-password=xxxx \"\n \"--pgsql-port=5432 --pgsql-db=tpcc --time=90 --rand-type=uniform --report-interval=10 run'\"\n )\n\n os.system(command)\n print_time()\n", + "description": "In a collaborative document editing tool, multiple users are simultaneously uploading, downloading, or editing files, causing contention in the input/output operations of the file system. This results in slower file transfers and editing processes.\n", + "desc": "In a file transfer system scenario, there is a database called 'FileShareDB' which is used by teams or organizations for sharing files. This database stores both the files themselves and the metadata related to the files such as uploader information, size, creation and modification dates, version history, access permissions, and download counts. On a typical workday, multiple users simultaneously upload, download, or edit files. This includes scenarios where a project team collaborates on an important report, with team members frequently uploading the latest version of files, while others download them for viewing or editing. Additionally, the system is used for storing and sharing large files like presentations, video conference recordings, or design drawings. Due to the high concurrency in file operations, the database faces challenges of input/output (I/O) contention. When multiple users simultaneously upload or download large files, it strains the system's storage and network bandwidth. This I/O contention leads to slower file transfer speeds, especially when there is limited bandwidth or insufficient server processing capabilities. 
Furthermore, frequent write operations in the database, such as file uploads and metadata updates, can impact the performance of the database. During peak periods, the database may encounter locking and transaction management issues, further slowing down file processing and the recording of metadata.\n" + }, + "134": { + "start_time": "1697316409", + "end_time": "1697316470", + "start_timestamp": "2023-10-15 04:46:49", + "end_timestamp": "2023-10-15 04:47:50", + "alerts": [], + "labels": [ + "POOR JOIN PERFORMANCE", + "CPU CONTENTION" + ], + "command": "python anomaly_trigger/main.py --anomaly POOR_JOIN_PERFORMANCE,CPU_CONTENTION", + "script": "import os\nimport re\nimport time\n\nimport psycopg2\n\n\nREPEATCOUNT = 1\nTIMELOGPATH = str(int(time.time())) + \"_job_trigger_time_log.txt\"\nTIMELOG = open(TIMELOGPATH, 'w+')\n\n\nclass Database():\n\n def __init__(self):\n self.conn = None\n self.conn = psycopg2.connect(database='imdbload',\n user='xxxx',\n password='xxxx',\n host='xxxx',\n port=5432)\n\n def execute_sql(self, sql):\n fail = 1\n cur = self.conn.cursor()\n i = 0\n cnt = 3\n while fail == 1 and i < cnt:\n try:\n fail = 0\n cur.execute(sql)\n except BaseException as error:\n fail = 1\n print(error)\n res = []\n if fail == 0:\n res = cur.fetchall()\n i = i + 1\n if fail == 1:\n # print(\"SQL Execution Fatal!!\", sql)\n return 0, ''\n elif fail == 0:\n return 1, res\n\n\ndef all_sql_files():\n res_path = \"{}/join-order-benchmark-master/\".format(\n os.path.dirname(os.path.abspath(__file__)))\n # all_file_list = list(filter(file_filter, os.listdir(res_path)))\n # all_file_list = sorted(all_file_list, key=custom_sort)\n all_file_list = [\n '1a.sql', '1b.sql', '1c.sql', '1d.sql',\n '2a.sql', '2b.sql', '2c.sql', '2d.sql',\n '3a.sql', '3b.sql', '3c.sql',\n '4a.sql', '4b.sql', '4c.sql',\n '5a.sql', '5b.sql', '5c.sql',\n '6a.sql', '6b.sql', '6c.sql', '6d.sql', '6e.sql', '6f.sql',\n '7a.sql', '7b.sql', '7c.sql',\n '8a.sql', '8b.sql', '8c.sql', '8d.sql',\n '9a.sql', '9b.sql', '9c.sql', '9d.sql',\n '10a.sql', '10b.sql', '10c.sql',\n '11a.sql', '11b.sql', '11c.sql', '11d.sql',\n '12a.sql', '12b.sql', '12c.sql',\n '13a.sql', '13b.sql', '13c.sql', '13d.sql',\n '14a.sql', '14b.sql', '14c.sql',\n '15a.sql', '15b.sql', '15c.sql', '15d.sql',\n '16a.sql', '16b.sql', '16c.sql', '16d.sql',\n '17a.sql', '17b.sql', '17c.sql', '17d.sql', '17e.sql', '17f.sql',\n '18a.sql', '18b.sql', '18c.sql',\n '19a.sql', '19b.sql', '19c.sql', '19d.sql',\n '20a.sql', '20b.sql', '20c.sql',\n '21a.sql', '21b.sql', '21c.sql',\n '22a.sql', '22b.sql', '22c.sql', '22d.sql',\n '23a.sql', '23b.sql', '23c.sql',\n '24a.sql', '24b.sql',\n '25a.sql', '25b.sql', '25c.sql',\n '26a.sql', '26b.sql', '26c.sql',\n '27a.sql', '27b.sql', '27c.sql',\n '28a.sql', '28b.sql', '28c.sql',\n '29a.sql', '29b.sql', '29c.sql',\n '30a.sql', '30b.sql', '30c.sql',\n '31a.sql', '31b.sql', '31c.sql',\n '32a.sql', '32b.sql',\n '33a.sql', '33b.sql', '33c.sql']\n\n print(all_file_list)\n files_list = []\n for file in all_file_list:\n files_list.append(res_path + file)\n return files_list\n\n\ndef custom_sort(item):\n # \u63d0\u53d6\u6570\u5b57\u548c\u5b57\u6bcd\u90e8\u5206\n match = re.match(r'(\\d+)(\\D+)', item)\n # \u5c06\u6570\u5b57\u90e8\u5206\u8f6c\u6362\u4e3a\u6574\u6570\u4ee5\u8fdb\u884c\u6bd4\u8f83\n num_part = int(match.group(1))\n # \u8fd4\u56de\u5143\u7ec4\u4ee5\u6309\u6570\u5b57\u548c\u5b57\u6bcd\u6392\u5e8f\n return (num_part, match.group(2))\n\n\ndef file_filter(f):\n if f[-4:] == '.sql' and 'schema' not in f and 'fkindexes' not in f:\n return 
True\n else:\n return False\n\n\ndef get_sql_from_file(file_name):\n file = open(file_name)\n lines = file.readlines().copy()\n sql = ''\n for line in lines:\n sql += line\n sql = sql.replace('\n', ' ').replace(' ', ' ').replace(' ', ' ')\n file.close()\n return sql\n\n\ndef test_hint_from_file(sql_file):\n db = Database()\n sql = get_sql_from_file(sql_file)\n success, result_cont = db.execute_sql(sql)\n print(success, result_cont)\n\n\ndef test_all():\n sql_files = all_sql_files()\n\n for sql_file in list(sql_files)[:-10]:\n if sql_file:\n test_hint_from_file(sql_file)\n\n\ndef test_one():\n res_path = \"{}/join-order-benchmark-master/\".format(\n os.path.dirname(os.path.abspath(__file__)))\n test_hint_from_file(res_path + '1a.sql')\n\n\nif __name__ == '__main__':\n for i in range(0, REPEATCOUNT):\n TIMELOG.write(str(int(time.time()))+\";\")\n test_all()\n TIMELOG.write(str(int(time.time()))+\"\n\")\n TIMELOG.flush()\n\n TIMELOG.close()\n\n)\n", + "description": "In a database used by an online gaming platform, there is a performance issue when executing a join operation due to poor optimization. This operation involves joining multiple tables related to player data. Additionally, there is CPU contention caused by multiple players actively using the gaming platform at the same time, which further affects the overall system performance.\n", + "desc": "In a business intelligence scenario, there is a database named 'CorporateAnalyticsDB' used for storing and analyzing various business data of a large corporation. This database contains multiple complex tables such as 'SalesData', 'CustomerProfiles', 'ProductCatalog', and 'MarketTrends'. The database administrators frequently need to perform complex join queries across multiple tables to generate comprehensive business reports. However, due to the large size of these tables and the involvement of multi-table joins, the performance of these queries can become very slow. This is because the join keys in these tables might not be effectively indexed, and the queries might not be properly optimized. As a result, executing these join operations can consume a significant amount of time and resources. During peak periods, when multiple complex join queries are executed simultaneously, there can be competition for CPU resources, further reducing query efficiency. 
This CPU contention might occur because there are too many compute-intensive queries running on the database server or because the server's CPU resources are insufficient to handle these queries.\n" + }, + "135": { + "start_time": "1697316530", + "end_time": "1697316680", + "start_timestamp": "2023-10-15 04:48:50", + "end_timestamp": "2023-10-15 04:51:20", + "alerts": [], + "labels": [ + "FETCH_LARGE_DATA", + "CORRELATED SUBQUERY" + ], + "command": "python anomaly_trigger/main.py --anomaly FETCH_LARGE_DATA,CORRELATED_SUBQUERY", + "script": "import os\nimport re\nimport time\n\nimport psycopg2\n\n\nREPEATCOUNT = 1\nTIMELOGPATH = str(int(time.time())) + \"_tpch_trigger_time_log.txt\"\nTIMELOG = open(TIMELOGPATH, 'w+')\n\n\nclass Database():\n\n def __init__(self):\n self.conn = None\n self.conn = psycopg2.connect(database='tpch',\n user='xxxx',\n password='xxxx',\n host='xxxx',\n port=5432)\n\n def execute_sql(self, sql):\n fail = 1\n cur = self.conn.cursor()\n i = 0\n cnt = 3\n while fail == 1 and i < cnt:\n try:\n fail = 0\n cur.execute(sql)\n except BaseException as error:\n fail = 1\n print(error)\n res = []\n if fail == 0:\n res = cur.fetchall()\n i = i + 1\n if fail == 1:\n # print(\"SQL Execution Fatal!!\", sql)\n return 0, ''\n elif fail == 0:\n return 1, res\n\n\ndef all_sql_files():\n res_path = \"{}/tpch-queries/\".format(\n os.path.dirname(os.path.abspath(__file__)))\n # all_file_list = list(filter(file_filter, os.listdir(res_path)))\n # all_file_list = sorted(all_file_list, key=custom_sort)\n all_file_list = [\n '4.explain.sql']\n\n print(all_file_list)\n files_list = []\n for file in all_file_list:\n files_list.append(res_path + file)\n return files_list\n\n\ndef custom_sort(item):\n # \u63d0\u53d6\u6570\u5b57\u548c\u5b57\u6bcd\u90e8\u5206\n match = re.match(r'(\\d+)(\\D+)', item)\n # \u5c06\u6570\u5b57\u90e8\u5206\u8f6c\u6362\u4e3a\u6574\u6570\u4ee5\u8fdb\u884c\u6bd4\u8f83\n num_part = int(match.group(1))\n # \u8fd4\u56de\u5143\u7ec4\u4ee5\u6309\u6570\u5b57\u548c\u5b57\u6bcd\u6392\u5e8f\n return (num_part, match.group(2))\n\n\ndef file_filter(f):\n if f[-4:] == '.sql' and 'schema' not in f and 'fkindexes' not in f:\n return True\n else:\n return False\n\n\ndef get_sql_from_file(file_name):\n file = open(file_name)\n lines = file.readlines().copy()\n sql = ''\n for line in lines:\n sql += line\n sql = sql.replace('\n', ' ').replace(' ', ' ').replace(' ', ' ')\n file.close()\n return sql\n\n\ndef test_hint_from_file(sql_file):\n db = Database()\n sql = get_sql_from_file(sql_file)\n success, result_cont = db.execute_sql(sql)\n print(success, result_cont)\n\n\ndef test_all():\n sql_files = all_sql_files()\n\n for sql_file in list(sql_files):\n if sql_file:\n test_hint_from_file(sql_file)\n\n\ndef test_one():\n res_path = \"{}/tpch-queries/\".format(\n os.path.dirname(os.path.abspath(__file__)))\n test_hint_from_file(res_path + '1.explain.sql')\n\n\nif __name__ == '__main__':\n for i in range(0, REPEATCOUNT):\n TIMELOG.write(str(int(time.time()))+\";\")\n test_all()\n TIMELOG.write(str(int(time.time()))+\"\n\")\n TIMELOG.flush()\n\n TIMELOG.close()\n", + "description": "In an e-commerce platform's database, there is a scenario where a large amount of data needs to be fetched while executing correlated subqueries. This can result in reduced performance while querying the inventory if the related subqueries are not optimized.\n", + "desc": "In an e-commerce platform scenario, there is a database called 'ECommerceDB' that stores various product information. 
This database includes a table named 'ProductInventory' which records the inventory details of different products. Each product's inventory information contains the product ID, stock level, last update time, supplier ID, warehouse location, and other relevant details. In this database, there is a frequent need to query the inventory levels of products, often involving correlated subqueries. For example, querying the total inventory of products within a specific category might require selecting all products from the 'ProductDetails' table and then performing subqueries on the 'ProductInventory' table to retrieve the inventory information. However, when dealing with a large number of products, such subqueries can lead to inefficient performance. This is especially true when a category has thousands of products, as retrieving inventory information for each product individually can be time-consuming. As a result, the database might need to read a significant amount of data from the disk, leading to potential I/O bottlenecks.\n" + }, + "136": { + "start_time": "1697316740", + "end_time": "1697316812", + "start_timestamp": "2023-10-15 04:52:20", + "end_timestamp": "2023-10-15 04:53:32", + "alerts": [], + "labels": [ + "highly concurrent commits or highly concurrent inserts" + ], + "command": "python anomaly_trigger/main.py --anomaly INSERT_LARGE_DATA", + "script": "import psycopg2\nimport sys\nsys.path.append('/root/DB-GPT/')\nimport time\nimport datetime\nimport random\nimport yaml\nfrom multiprocessing.pool import *\n\nclass DBArgs(object):\n\n def __init__(self, dbtype, config, dbname=None):\n self.dbtype = dbtype\n if self.dbtype == 'mysql':\n self.host = config['host']\n self.port = config['port']\n self.user = config['user']\n self.password = config['password']\n self.dbname = dbname if dbname else config['dbname']\n self.driver = 'com.mysql.jdbc.Driver'\n self.jdbc = 'jdbc:mysql://'\n else:\n self.host = config['host']\n self.port = config['port']\n self.user = config['user']\n self.password = config['password']\n self.dbname = dbname if dbname else config['dbname']\n self.driver = 'org.postgresql.Driver'\n self.jdbc = 'jdbc:postgresql://'\n\nclass Database():\n def __init__(self, args, timeout=-1):\n self.args = args\n self.conn = self.resetConn(timeout)\n\n\n # self.schema = self.compute_table_schema()\n\n def resetConn(self, timeout=-1):\n conn = psycopg2.connect(database=self.args.dbname,\n user=self.args.user,\n password=self.args.password,\n host=self.args.host,\n port=self.args.port)\n return conn\n\n def execute_sqls(self,sql):\n self.conn =self.resetConn(timeout=-1)\n cur = self.conn.cursor()\n cur.execute(sql)\n self.conn.commit()\n cur.close()\n self.conn.close()\n\n def execute_sql_duration(self, duration, sql, max_id=0, commit_interval=500):\n self.conn = self.resetConn(timeout=-1)\n cursor = self.conn.cursor()\n start = time.time()\n cnt = 0\n if duration > 0:\n while (time.time() - start) < duration:\n if max_id > 0:\n id = random.randint(1, max_id - 1)\n cursor.execute(sql + str(id) + ';')\n else:\n cursor.execute(sql)\n cnt += 1\n if cnt % commit_interval == 0:\n self.conn.commit()\n else:\n print(\"error, the duration should be larger than 0\")\n self.conn.commit()\n cursor.close()\n self.conn.close()\n return cnt\n\n def concurrent_execute_sql(self, threads, duration, sql, max_id=0, commit_interval=500):\n pool = ThreadPool(threads)\n results = [pool.apply_async(self.execute_sql_duration, (duration, sql, max_id, commit_interval)) for _ in range(threads)]\n 
pool.close()\n pool.join()\n return results\n \ndef init():\n #add the config\n config_path = \"/root/DB-GPT/config/tool_config.yaml\"\n with open(config_path, 'r') as config_file:\n config = yaml.safe_load(config_file) \n db_args =DBArgs('pgsql', config)\n return db_args\n\n#create a table\ndef create_table(table_name,colsize, ncolumns):\n db=Database(init())\n column_definitions = ', '.join(f'name{i} varchar({colsize})' for i in range(ncolumns))\n creat_sql = f'CREATE TABLE {table_name} (id int, {column_definitions}, time timestamp);'\n db.execute_sqls(creat_sql)\n\n#delete the table\ndef delete_table(table_name):\n db=Database(init())\n delete_sql=f'DROP TABLE if exists {table_name}'\n db.execute_sqls(delete_sql)\n\n#print the current time\ndef print_time():\n current_time = datetime.datetime.now()\n formatted_time = current_time.strftime(\"%Y-%m-%d %H:%M:%S\")\n print(formatted_time)\n\n\ndef insert_large_data(threads,duration,ncolumns,nrows,colsize,table_name='table1'):\n\n print_time()\n #Delete undeleted tables\n delete_table(table_name)\n #create a new table\n create_table(table_name,colsize, ncolumns)\n db=Database(init())\n #insert the data\n #insert_definitions = ', '.join(f'repeat(round(random()*999)::text,{(colsize//3)})' for i in range(ncolumns))\n insert_definitions = ', '.join(f'(SELECT substr(md5(random()::text), 1, {colsize}))' for i in range(ncolumns))\n insert_data=f'insert into {table_name} select generate_series(1,{nrows}),{insert_definitions}, now();'\n db.concurrent_execute_sql(threads,duration,insert_data,commit_interval=1)\n\n #delete the table\n delete_table(table_name)\n \n #print the end time\n print_time()\nif __name__ == \"__main__\":\n # Number of threads to use for concurrent inserts\n num_threads = 178\n \n # Duration for which to run the inserts (in seconds)\n insert_duration = None\n \n # Number of columns in the table\n num_columns = 18\n \n # Number of rows to insert\n num_rows = 60\n \n # Size of each column (in characters)\n column_size = 27\n \n # Table name\n table_name = 'table1'\n \n # Call the insert_large_data function\n insert_large_data(num_threads, insert_duration, num_columns, num_rows, column_size, table_name)\n", + "description": "In an Internet of Things (IoT) system, 178 sensors generate a large amount of data to be inserted into a database simultaneously. Each data record contains 18 columns, with each column having a size of 27 characters. There are a total of 60 data records. This script simulates the database exception that can occur due to the high volume of data being inserted at once.\n", + "desc": "In an Internet of Things (IoT) scenario, there is a database specifically used for collecting and analyzing sensor data, named 'SensorDataDB'. This database is designed to handle a large volume of data from various types of sensors. The primary table in the database is called 'SensorReadings', which contains various fields to store data from 60 sensors. These fields may include sensor ID, reading type (such as temperature, humidity, pressure, light, motion, etc.), reading value, timestamp, sensor location, and status information.When all 60 sensors start transmitting data simultaneously, the database might encounter performance issues. Due to the lack of effective data partitioning in the 'SensorReadings' table, insufficient buffering mechanisms, or improper indexing, the database's ability to process these numerous concurrent write requests is limited. 
This can lead to increased write latency in the database and, in some cases, may even result in database locking, ultimately leading to anomalies.\n" + }, + "137": { + "start_time": "1697316872", + "end_time": "1697316945", + "start_timestamp": "2023-10-15 04:54:32", + "end_timestamp": "2023-10-15 04:55:45", + "alerts": [], + "labels": [ + "highly concurrent commits or highly concurrent inserts" + ], + "command": "python anomaly_trigger/main.py --anomaly INSERT_LARGE_DATA", + "script": "import psycopg2\nimport sys\nsys.path.append('/root/DB-GPT/')\nimport time\nimport datetime\nimport random\nimport yaml\nfrom multiprocessing.pool import *\n\nclass DBArgs(object):\n\n def __init__(self, dbtype, config, dbname=None):\n self.dbtype = dbtype\n if self.dbtype == 'mysql':\n self.host = config['host']\n self.port = config['port']\n self.user = config['user']\n self.password = config['password']\n self.dbname = dbname if dbname else config['dbname']\n self.driver = 'com.mysql.jdbc.Driver'\n self.jdbc = 'jdbc:mysql://'\n else:\n self.host = config['host']\n self.port = config['port']\n self.user = config['user']\n self.password = config['password']\n self.dbname = dbname if dbname else config['dbname']\n self.driver = 'org.postgresql.Driver'\n self.jdbc = 'jdbc:postgresql://'\n\nclass Database():\n def __init__(self, args, timeout=-1):\n self.args = args\n self.conn = self.resetConn(timeout)\n\n\n # self.schema = self.compute_table_schema()\n\n def resetConn(self, timeout=-1):\n conn = psycopg2.connect(database=self.args.dbname,\n user=self.args.user,\n password=self.args.password,\n host=self.args.host,\n port=self.args.port)\n return conn\n\n def execute_sqls(self,sql):\n self.conn =self.resetConn(timeout=-1)\n cur = self.conn.cursor()\n cur.execute(sql)\n self.conn.commit()\n cur.close()\n self.conn.close()\n\n def execute_sql_duration(self, duration, sql, max_id=0, commit_interval=500):\n self.conn = self.resetConn(timeout=-1)\n cursor = self.conn.cursor()\n start = time.time()\n cnt = 0\n if duration > 0:\n while (time.time() - start) < duration:\n if max_id > 0:\n id = random.randint(1, max_id - 1)\n cursor.execute(sql + str(id) + ';')\n else:\n cursor.execute(sql)\n cnt += 1\n if cnt % commit_interval == 0:\n self.conn.commit()\n else:\n print(\"error, the duration should be larger than 0\")\n self.conn.commit()\n cursor.close()\n self.conn.close()\n return cnt\n\n def concurrent_execute_sql(self, threads, duration, sql, max_id=0, commit_interval=500):\n pool = ThreadPool(threads)\n results = [pool.apply_async(self.execute_sql_duration, (duration, sql, max_id, commit_interval)) for _ in range(threads)]\n pool.close()\n pool.join()\n return results\n \ndef init():\n #add the config\n config_path = \"/root/DB-GPT/config/tool_config.yaml\"\n with open(config_path, 'r') as config_file:\n config = yaml.safe_load(config_file) \n db_args =DBArgs('pgsql', config)\n return db_args\n\n#create a table\ndef create_table(table_name,colsize, ncolumns):\n db=Database(init())\n column_definitions = ', '.join(f'name{i} varchar({colsize})' for i in range(ncolumns))\n creat_sql = f'CREATE TABLE {table_name} (id int, {column_definitions}, time timestamp);'\n db.execute_sqls(creat_sql)\n\n#delete the table\ndef delete_table(table_name):\n db=Database(init())\n delete_sql=f'DROP TABLE if exists {table_name}'\n db.execute_sqls(delete_sql)\n\n#print the current time\ndef print_time():\n current_time = datetime.datetime.now()\n formatted_time = current_time.strftime(\"%Y-%m-%d %H:%M:%S\")\n 
print(formatted_time)\n\n\ndef insert_large_data(threads,duration,ncolumns,nrows,colsize,table_name='table1'):\n\n print_time()\n #Delete undeleted tables\n delete_table(table_name)\n #create a new table\n create_table(table_name,colsize, ncolumns)\n db=Database(init())\n #insert the data\n #insert_definitions = ', '.join(f'repeat(round(random()*999)::text,{(colsize//3)})' for i in range(ncolumns))\n insert_definitions = ', '.join(f'(SELECT substr(md5(random()::text), 1, {colsize}))' for i in range(ncolumns))\n insert_data=f'insert into {table_name} select generate_series(1,{nrows}),{insert_definitions}, now();'\n db.concurrent_execute_sql(threads,duration,insert_data,commit_interval=1)\n\n #delete the table\n delete_table(table_name)\n \n #print the end time\n print_time()\nif __name__ == \"__main__\":\n # Number of threads to use for concurrent inserts\n num_threads = 178\n \n # Duration for which to run the inserts (in seconds)\n insert_duration = None\n \n # Number of columns in the table\n num_columns = 25\n \n # Number of rows to insert\n num_rows = 56\n \n # Size of each column (in characters)\n column_size = 81\n \n # Table name\n table_name = 'table1'\n \n # Call the insert_large_data function\n insert_large_data(num_threads, insert_duration, num_columns, num_rows, column_size, table_name)\n", + "description": "In a data-intensive application, 178 threads are used to simultaneously insert a large amount of data into a database table. Each row in the table contains 25 columns, with each column having a size of 81 characters. The table contains a total of 56 rows. This process is designed to simulate the database exception that occurs when a high volume of data is being inserted into the database.\n", + "desc": "In the Internet of Things (IoT) scenario, imagine a database specifically designed for storing and analyzing sensor data in a smart home environment. This database, named 'SensorDataDB', is responsible for collecting data from various types of sensors and processing it for further analysis. One of the key tables in this database is called 'SensorReadings', which contains detailed information about the sensor readings, such as temperature, humidity, motion, light intensity, and more. Each row in this table represents a unique sensor reading entry, and there are a total of 56 such entries. The 'SensorReadings' table consists of 25 columns, each of which can store up to 81 characters of data. These columns may include sensor ID, reading type, reading value, timestamp, location, and other relevant information. In this scenario, the database is facing the challenge of handling a large amount of data that is being inserted simultaneously. This is due to the fact that there are 178 sensor devices deployed in the smart home environment, and all of them are transmitting data to the database simultaneously. As a result, the database might encounter performance issues, as it needs to process and store this large volume of incoming data in a timely manner. Without proper optimization measures, such as efficient buffering mechanisms, appropriate data partitioning, or optimized indexing, the database's ability to handle this large influx of data could be compromised. 
This might lead to increased write latency and even database locking, ultimately resulting in anomalies in the system.\n" + }, + "138": { + "start_time": "1697317005", + "end_time": "1697317065", + "start_timestamp": "2023-10-15 04:56:45", + "end_timestamp": "2023-10-15 04:57:45", + "alerts": [], + "labels": [ + "highly concurrent updates" + ], + "command": "python anomaly_trigger/main.py --anomaly LOCK_CONTENTION", + "script": "import psycopg2\nimport sys\nsys.path.append('/root/DB-GPT/')\nimport time\nimport datetime\nimport random\nimport yaml\nfrom multiprocessing.pool import *\n\n\nclass DBArgs(object):\n\n def __init__(self, dbtype, config, dbname=None):\n self.dbtype = dbtype\n if self.dbtype == 'mysql':\n self.host = config['host']\n self.port = config['port']\n self.user = config['user']\n self.password = config['password']\n self.dbname = dbname if dbname else config['dbname']\n self.driver = 'com.mysql.jdbc.Driver'\n self.jdbc = 'jdbc:mysql://'\n else:\n self.host = config['host']\n self.port = config['port']\n self.user = config['user']\n self.password = config['password']\n self.dbname = dbname if dbname else config['dbname']\n self.driver = 'org.postgresql.Driver'\n self.jdbc = 'jdbc:postgresql://'\n\nclass Database():\n def __init__(self, args, timeout=-1):\n self.args = args\n self.conn = self.resetConn(timeout)\n\n\n # self.schema = self.compute_table_schema()\n\n def resetConn(self, timeout=-1):\n conn = psycopg2.connect(database=self.args.dbname,\n user=self.args.user,\n password=self.args.password,\n host=self.args.host,\n port=self.args.port)\n return conn\n \n def execute_sqls(self,sql):\n self.conn =self.resetConn(timeout=-1)\n cur = self.conn.cursor()\n cur.execute(sql)\n self.conn.commit()\n cur.close()\n self.conn.close()\n\n def execute_sql_duration(self, duration, sql, max_id=0, commit_interval=500):\n self.conn = self.resetConn(timeout=-1)\n cursor = self.conn.cursor()\n start = time.time()\n cnt = 0\n if duration > 0:\n while (time.time() - start) < duration:\n if max_id > 0:\n id = random.randint(1, max_id - 1)\n cursor.execute(sql + str(id) + ';')\n else:\n cursor.execute(sql)\n cnt += 1\n if cnt % commit_interval == 0:\n self.conn.commit()\n else:\n print(\"error, the duration should be larger than 0\")\n self.conn.commit()\n cursor.close()\n self.conn.close()\n return cnt\n\n def concurrent_execute_sql(self, threads, duration, sql, max_id=0, commit_interval=500):\n pool = ThreadPool(threads)\n results = [pool.apply_async(self.execute_sql_duration, (duration, sql, max_id, commit_interval)) for _ in range(threads)]\n pool.close()\n pool.join()\n return results\n\ndef init():\n #add the config\n config_path = \"/root/DB-GPT/config/tool_config.yaml\"\n with open(config_path, 'r') as config_file:\n config = yaml.safe_load(config_file) \n db_args =DBArgs('pgsql', config)\n return db_args\n\n\n#create a table\ndef create_table(table_name,colsize, ncolumns):\n db=Database(init())\n column_definitions = ', '.join(f'name{i} varchar({colsize})' for i in range(ncolumns))\n creat_sql = f'CREATE TABLE {table_name} (id int, {column_definitions}, time timestamp);'\n db.execute_sqls(creat_sql)\n\n#delete the table\ndef delete_table(table_name):\n db=Database(init())\n delete_sql=f'DROP TABLE if exists {table_name}'\n db.execute_sqls(delete_sql)\n\n#print the current time\ndef print_time():\n current_time = datetime.datetime.now()\n formatted_time = current_time.strftime(\"%Y-%m-%d %H:%M:%S\")\n print(formatted_time)\n\n\ndef 
lock_contention(threads,duration,ncolumns,nrows,colsize,table_name='table1'):\n #create a new table\n print_time()\n delete_table(table_name)\n create_table(table_name,colsize, ncolumns)\n db=Database(init())\n # insert some data to be updated \n insert_definitions = ', '.join(f'(SELECT substr(md5(random()::text), 1, {colsize}))' for i in range(ncolumns))\n insert_data=f'insert into {table_name} select generate_series(1,{nrows}),{insert_definitions}, now();' \n db.execute_sqls(insert_data) \n pool = Pool(threads)\n for _ in range(threads):\n pool.apply_async(\n lock, (table_name, ncolumns, colsize, duration, nrows))\n pool.close()\n pool.join()\n #delete the table\n delete_table(table_name)\n print_time()\n\ndef lock(table_name, ncolumns, colsize, duration, nrows):\n args=init()\n start = time.time()\n #lock_contention\n while time.time()-start < duration:\n conn = psycopg2.connect(database=args.dbname, user=args.user, password=args.password,\n host=args.host, port=args.port)\n cur = conn.cursor()\n while time.time()-start < duration:\n col_name = random.randint(0, ncolumns-1)\n row_name = random.randint(1, nrows-1)\n lock_contention = f'update {table_name} set name{col_name}=(SELECT substr(md5(random()::text), 1, {colsize})) where id ={row_name}'\n #db.concurrent_execute_sql(threads,duration,lock_contention,nrows)\n cur.execute(lock_contention)\n conn.commit()\n conn.commit()\n conn.close()\n\nif __name__ == \"__main__\":\n # Number of threads to use for concurrent inserts\n num_threads = 84\n \n # Duration for which to run the inserts (in seconds)\n insert_duration = None\n \n # Number of columns in the table\n num_columns = 87\n \n # Number of rows to insert\n num_rows = 226\n \n # Size of each column (in characters)\n column_size = 79\n \n # Table name\n table_name = 'table1'\n \n # Call the insert_large_data function\n lock_contention(num_threads, insert_duration, num_columns, num_rows, column_size, table_name)\n", + "description": "In the database of an online marketplace, 84 users simultaneously attempt to perform frequent update operations on a database table containing 87 columns and 226 rows of product records. Each column has a size of 79 characters. Due to the high number of users competing to lock the database table, a database exception occurs as they try to perform the update operations.\n", + "desc": "In an Internet of Things (IoT) scenario, imagine a database designed for collecting and analyzing sensor data, named 'SensorDataDB'. This database is used to store data from various sensors and consists of multiple tables. One of the key tables in this database is called 'SensorReadings', which records readings from different sensors. Each row in this table represents a reading from a specific sensor at a specific time and contains information such as sensor ID, reading type, reading value, timestamp, and sensor location. In this specific scenario, there are 226 rows in the 'SensorReadings' table, with each row containing 87 columns, each column capable of storing data up to 79 characters. These columns include information like sensor ID, reading type, reading value, timestamp, sensor location, and more. During the operation of the sensor network, multiple sensors might transmit data to the database simultaneously. These concurrent write requests can sometimes lead to contention for database locks. In this specific scenario, there are 84 threads attempting to write data to the 'SensorReadings' table concurrently. 
Due to the database's locking mechanism and the high number of concurrent write operations, there might be contention for locks on the table. This contention can result in delays or blocking of write operations, ultimately impacting the real-time processing of sensor data and potentially causing anomalies in the system.\n" + }, + "139": { + "start_time": "1697317125", + "end_time": "1697317168", + "start_timestamp": "2023-10-15 04:58:45", + "end_timestamp": "2023-10-15 04:59:28", + "alerts": [], + "labels": [ + "highly deletes" + ], + "command": "python anomaly_trigger/main.py --anomaly VACUUM", + "script": "import psycopg2\nimport sys\nsys.path.append('/root/DB-GPT/')\nimport time\nimport datetime\nimport random\nimport yaml\nfrom multiprocessing.pool import *\n\n\nclass DBArgs(object):\n\n def __init__(self, dbtype, config, dbname=None):\n self.dbtype = dbtype\n if self.dbtype == 'mysql':\n self.host = config['host']\n self.port = config['port']\n self.user = config['user']\n self.password = config['password']\n self.dbname = dbname if dbname else config['dbname']\n self.driver = 'com.mysql.jdbc.Driver'\n self.jdbc = 'jdbc:mysql://'\n else:\n self.host = config['host']\n self.port = config['port']\n self.user = config['user']\n self.password = config['password']\n self.dbname = dbname if dbname else config['dbname']\n self.driver = 'org.postgresql.Driver'\n self.jdbc = 'jdbc:postgresql://'\n\nclass Database():\n def __init__(self, args, timeout=-1):\n self.args = args\n self.conn = self.resetConn(timeout)\n\n\n # self.schema = self.compute_table_schema()\n\n def resetConn(self, timeout=-1):\n conn = psycopg2.connect(database=self.args.dbname,\n user=self.args.user,\n password=self.args.password,\n host=self.args.host,\n port=self.args.port)\n return conn\n \n def execute_sqls(self,sql):\n self.conn =self.resetConn(timeout=-1)\n cur = self.conn.cursor()\n cur.execute(sql)\n self.conn.commit()\n cur.close()\n self.conn.close()\n\n def execute_sql_duration(self, duration, sql, max_id=0, commit_interval=500):\n self.conn = self.resetConn(timeout=-1)\n cursor = self.conn.cursor()\n start = time.time()\n cnt = 0\n if duration > 0:\n while (time.time() - start) < duration:\n if max_id > 0:\n id = random.randint(1, max_id - 1)\n cursor.execute(sql + str(id) + ';')\n else:\n cursor.execute(sql)\n cnt += 1\n if cnt % commit_interval == 0:\n self.conn.commit()\n else:\n print(\"error, the duration should be larger than 0\")\n self.conn.commit()\n cursor.close()\n self.conn.close()\n return cnt\n\n def concurrent_execute_sql(self, threads, duration, sql, max_id=0, commit_interval=500):\n pool = ThreadPool(threads)\n results = [pool.apply_async(self.execute_sql_duration, (duration, sql, max_id, commit_interval)) for _ in range(threads)]\n pool.close()\n pool.join()\n return results\n \ndef init():\n #add the config\n config_path = \"/root/DB-GPT/config/tool_config.yaml\"\n with open(config_path, 'r') as config_file:\n config = yaml.safe_load(config_file) \n db_args =DBArgs('pgsql', config)\n return db_args\n\n\n#create a table\ndef create_table(table_name,colsize, ncolumns):\n db=Database(init())\n column_definitions = ', '.join(f'name{i} varchar({colsize})' for i in range(ncolumns))\n creat_sql = f'CREATE TABLE {table_name} (id int, {column_definitions}, time timestamp);'\n db.execute_sqls(creat_sql)\n\n#delete the table\ndef delete_table(table_name):\n db=Database(init())\n delete_sql=f'DROP TABLE if exists {table_name}'\n db.execute_sqls(delete_sql)\n\n#print the current time\ndef print_time():\n 
current_time = datetime.datetime.now()\n formatted_time = current_time.strftime(\"%Y-%m-%d %H:%M:%S\")\n print(formatted_time)\n\n\ndef vacuum(threads,duration,ncolumns,nrows,colsize,table_name='table1'):\n db=Database(init())\n #create a new table\n print_time()\n delete_table(table_name)\n create_table(table_name,colsize, ncolumns)\n\n # insert some data to be deleted\n insert_definitions = ', '.join(f'(SELECT substr(md5(random()::text), 1, {colsize}))' for i in range(ncolumns))\n insert_data=f'insert into {table_name} select generate_series(1,{nrows}),{insert_definitions}, now();' \n db.execute_sqls(insert_data) \n\n # delete 80% of the rows\n delete_nrows=int(nrows*0.8)\n vacuum=f'delete from {table_name} where id < {delete_nrows};'\n db.execute_sqls(vacuum)\n\n # do the select , then the vacuum occurs\n select='select * from '+table_name+' where id='\n db.concurrent_execute_sql(threads,duration,select,nrows)\n\n #delete the table\n delete_table(table_name)\n print_time()\n\n\nif __name__ == \"__main__\":\n # Number of threads to use for concurrent inserts\n num_threads = 136\n \n # Duration for which to run the inserts (in seconds)\n insert_duration = None\n \n # Number of columns in the table\n num_columns = 18\n \n # Number of rows to insert\n num_rows = 3657776\n \n # Size of each column (in characters)\n column_size = 81\n \n # Table name\n table_name = 'table1'\n \n # Call the insert_large_data function\n vacuum(num_threads, insert_duration, num_columns, num_rows, column_size, table_name)\n", + "description": "In the scenario of an online store's database, when 136 users simultaneously search for products using terms such as name, category, and price range in a database table containing 18 columns, 3,657,776 rows of product records, each column size being 81 characters, an exception is triggered due to the lack of a necessary index.\n", + "desc": "In a file sharing system scenario, there is a database called 'TeamFileShareDB', which is used by teams or organizations for sharing files. This database stores both the files themselves and the metadata associated with the files, such as uploader information, file size, creation and modification dates, version history, access permissions, and download counts. During a typical workday, multiple users are constantly uploading, downloading, and editing files. For example, a project team may be collaborating on an important report, with team members frequently uploading the latest versions of files and others downloading them for viewing or editing. Additionally, the system is used for storing and sharing large files, such as presentations, video conference recordings, or design drawings. In this particular scenario, the 'TeamFileShareDB' database is facing the challenge of a vacuum operation. The vacuum operation is a type of database maintenance operation that reclaims unused space and reorganizes the database to improve performance. When the database has a large number of updates or deletions, it can result in unused space within the database files, which can impact performance and increase storage usage. The vacuum operation is necessary to reclaim this space and optimize the performance of the database.The script is running with 136 threads, indicating a high level of concurrency in the database operations. The 'VACUUM' anomaly is triggered with specific parameters, including 18 columns in the table, each with a size of 81 characters, and a total of 3,657,776 rows of data. 
This indicates that the database has a large amount of data that requires vacuuming.During the vacuum operation, the database administrator is clearing out unused or outdated data, optimizing storage space, and improving the database's overall performance. However, due to the large scale of the operation and the high level of concurrent database access, there might be contention for database resources, such as CPU and disk I/O. This contention can cause delays in the vacuum operation and impact the overall performance of the file sharing system.\n" + }, + "140": { + "start_time": "1697317228", + "end_time": "1697317341", + "start_timestamp": "2023-10-15 05:00:28", + "end_timestamp": "2023-10-15 05:02:21", + "alerts": [], + "labels": [ + "too many indexes" + ], + "command": "python anomaly_trigger/main.py --anomaly REDUNDANT_INDEX", + "script": "import psycopg2\nimport sys\nsys.path.append('/root/DB-GPT/')\nimport time\nimport datetime\nimport random\nimport yaml\nfrom multiprocessing.pool import *\n\n\nclass DBArgs(object):\n\n def __init__(self, dbtype, config, dbname=None):\n self.dbtype = dbtype\n if self.dbtype == 'mysql':\n self.host = config['host']\n self.port = config['port']\n self.user = config['user']\n self.password = config['password']\n self.dbname = dbname if dbname else config['dbname']\n self.driver = 'com.mysql.jdbc.Driver'\n self.jdbc = 'jdbc:mysql://'\n else:\n self.host = config['host']\n self.port = config['port']\n self.user = config['user']\n self.password = config['password']\n self.dbname = dbname if dbname else config['dbname']\n self.driver = 'org.postgresql.Driver'\n self.jdbc = 'jdbc:postgresql://'\n\nclass Database():\n def __init__(self, args, timeout=-1):\n self.args = args\n self.conn = self.resetConn(timeout)\n\n\n # self.schema = self.compute_table_schema()\n\n def resetConn(self, timeout=-1):\n conn = psycopg2.connect(database=self.args.dbname,\n user=self.args.user,\n password=self.args.password,\n host=self.args.host,\n port=self.args.port)\n return conn\n \n def execute_sqls(self,sql):\n self.conn =self.resetConn(timeout=-1)\n cur = self.conn.cursor()\n cur.execute(sql)\n self.conn.commit()\n cur.close()\n self.conn.close()\n\n def execute_sql_duration(self, duration, sql, max_id=0, commit_interval=500):\n self.conn = self.resetConn(timeout=-1)\n cursor = self.conn.cursor()\n start = time.time()\n cnt = 0\n if duration > 0:\n while (time.time() - start) < duration:\n if max_id > 0:\n id = random.randint(1, max_id - 1)\n cursor.execute(sql + str(id) + ';')\n else:\n cursor.execute(sql)\n cnt += 1\n if cnt % commit_interval == 0:\n self.conn.commit()\n else:\n print(\"error, the duration should be larger than 0\")\n self.conn.commit()\n cursor.close()\n self.conn.close()\n return cnt\n\n def concurrent_execute_sql(self, threads, duration, sql, max_id=0, commit_interval=500):\n pool = ThreadPool(threads)\n results = [pool.apply_async(self.execute_sql_duration, (duration, sql, max_id, commit_interval)) for _ in range(threads)]\n pool.close()\n pool.join()\n return results\n \n def build_index(self, table_name, idx_num):\n self.conn = self.resetConn(timeout=-1)\n cursor = self.conn.cursor()\n \n for i in range(0, idx_num):\n the_sql = 'CREATE INDEX index_' + table_name + '_' + str(i) + ' ON ' + table_name + '(name' + str(i) + ');'\n print(the_sql)\n cursor.execute(the_sql)\n\n \n self.conn.commit()\n self.conn.close()\n return\n\n\n \n def drop_index(self,table_name):\n self.conn = self.resetConn(timeout=-1)\n cursor = self.conn.cursor()\n 
cursor.execute(\"select indexname from pg_indexes where tablename='\"+table_name+\"';\")\n idxs = cursor.fetchall()\n for idx in idxs:\n the_sql = 'DROP INDEX ' + idx[0] + ';'\n cursor.execute(the_sql)\n print(the_sql)\n self.conn.commit()\n self.conn.close()\n return\n\n\ndef init():\n #add the config\n config_path = \"/root/DB-GPT/config/tool_config.yaml\"\n with open(config_path, 'r') as config_file:\n config = yaml.safe_load(config_file) \n db_args =DBArgs('pgsql', config)\n return db_args\n\n\n#create a table\ndef create_table(table_name,colsize, ncolumns):\n db=Database(init())\n column_definitions = ', '.join(f'name{i} varchar({colsize})' for i in range(ncolumns))\n creat_sql = f'CREATE TABLE {table_name} (id int, {column_definitions}, time timestamp);'\n db.execute_sqls(creat_sql)\n\n#delete the table\ndef delete_table(table_name):\n db=Database(init())\n delete_sql=f'DROP TABLE if exists {table_name}'\n db.execute_sqls(delete_sql)\n\n#print the current time\ndef print_time():\n current_time = datetime.datetime.now()\n formatted_time = current_time.strftime(\"%Y-%m-%d %H:%M:%S\")\n print(formatted_time)\n\ndef redundent_index(threads,duration,ncolumns,nrows,colsize,nindex,table_name='table1'):\n #create a new table\n print_time()\n delete_table(table_name)\n create_table(table_name,colsize, ncolumns)\n db=Database(init())\n # insert some data to be updated \n insert_definitions = ', '.join(f'(SELECT substr(md5(random()::text), 1, {colsize}))' for i in range(ncolumns))\n insert_data=f'insert into {table_name} select generate_series(1,{nrows}),{insert_definitions}, now();' \n db.execute_sqls(insert_data) \n\n #initialization of the indexes\n nindex=int((nindex*ncolumns)/10)\n db.build_index(table_name,nindex)\n id_index='CREATE INDEX index_'+table_name+'_id ON '+table_name+'(id);'\n db.execute_sqls(id_index)\n\n #lock_contention\n pool = Pool(threads)\n for _ in range(threads):\n pool.apply_async(\n lock, (table_name, ncolumns, colsize, duration, nrows))\n pool.close()\n pool.join()\n\n #drop the index\n db.drop_index(table_name)\n\n #delete the table\n delete_table(table_name)\n print_time()\n\n\ndef lock(table_name, ncolumns, colsize, duration, nrows):\n args=init()\n start = time.time()\n #lock_contention\n while time.time()-start < duration:\n conn = psycopg2.connect(database=args.dbname, user=args.user, password=args.password,\n host=args.host, port=args.port)\n cur = conn.cursor()\n while time.time()-start < duration:\n col_name = random.randint(0, ncolumns-1)\n row_name = random.randint(1, nrows-1)\n lock_contention = f'update {table_name} set name{col_name}=(SELECT substr(md5(random()::text), 1, {colsize})) where id ={row_name}'\n #db.concurrent_execute_sql(threads,duration,lock_contention,nrows)\n cur.execute(lock_contention)\n conn.commit()\n conn.commit()\n conn.close()\n\nif __name__ == \"__main__\":\n # Number of threads to use for concurrent inserts\n num_threads = 5\n \n # Duration for which to run the inserts (in seconds)\n insert_duration = None\n \n # Number of columns in the table\n num_columns = 86\n \n # Number of rows to insert\n num_rows = 415253\n \n # Size of each column (in characters)\n column_size = 87\n \n # Table name\n table_name = 'table1'\n \n nindex=6\n \n # Call the insert_large_data function\n redundent_index(num_threads, insert_duration, num_columns, num_rows, column_size, nindex,table_name)\n", + "description": "In a database for an e-commerce platform, there are 5 users simultaneously querying a table with 86 columns and 415,253 rows of product 
records. Each column has a size of 87 characters. The query involves redundant indexes that were created initially but are later deleted. The purpose is to simulate the impact of additional storage and performance overhead caused by this process.\n", + "desc": "In a business intelligence scenario, especially involving the financial statements of large companies, redundant index creation statements are caused by the need to perform index acceleration for queries such as financial analysis. Consider a database named 'CorporateFinanceDB', which is specialized in storing and processing the financial data of large corporations. This database contains multiple tables, one of which is a key table named 'FinancialRecords', recording various financial transactions and statement information of the company. Suppose this table contains 415,253 rows of data, each row representing a financial record. These rows have 86 columns, each containing information of up to 87 characters. These columns may include transaction ID, transaction type (such as income, expenditure, assets, liabilities), amount, date, department, project code, budget code, financial year, audit status, etc. In a typical business intelligence analysis process, to quickly respond to various complex query demands, such as departmental budget analysis, quarterly income reports, or annual audits, the database administrator might create a large number of indexes before queries. These indexes could be based on transaction type, date range, department, or project code. Suppose at a specific moment, 5 users simultaneously conduct complex financial queries on the 'FinancialRecords' table. To enhance the efficiency of these queries, the administrator creates multiple redundant indexes before the queries start and then deletes them after the queries are completed. This frequent creation and deletion of indexes can lead to additional storage usage and performance overhead in the database. Moreover, frequent index operations might cause database fragmentation, further impacting performance. In a business intelligence environment, this could result in delayed report generation, affecting the efficiency of the decision-making process.\n" + }, + "141": { + "start_time": "1697317402", + "end_time": "1697317492", + "start_timestamp": "2023-10-15 05:03:22", + "end_timestamp": "2023-10-15 05:04:52", + "alerts": [], + "labels": [ + "INSERT_LARGE_DATA", + "IO_CONTENTION" + ], + "command": "python anomaly_trigger/main.py --anomaly INSERT_LARGE_DATA,IO_CONTENTION", + "script": "import os\nimport datetime\n\n#print the current time\ndef print_time():\n current_time = datetime.datetime.now()\n formatted_time = current_time.strftime(\"%Y-%m-%d %H:%M:%S\")\n print(formatted_time)\n\nif __name__ == \"__main__\":\n print_time()\n command = (\n \"su - root -c 'cd /sysbench-tpcc-master; \"\n \"./tpcc.lua --db-driver=pgsql --tables=2 --scale=3 --threads=50 --events=0 \"\n \"--pgsql-host=xxxx --pgsql-user=xxxx --pgsql-password=xxxx \"\n \"--pgsql-port=5432 --pgsql-db=tpcc --time=90 --rand-type=uniform --report-interval=10 run'\"\n )\n\n os.system(command)\n print_time()\n", + "description": "In a file sharing system where multiple users upload, download, or edit files simultaneously, the file system experiences competition for input/output (I/O) resources. This results in a slowdown of file transfer operations.\n", + "desc": "In an IoT scenario, let's imagine a database used for collecting and analyzing sensor data. 
This database is known as 'SensorDataDB' and is specifically designed to handle a large volume of data from various types of sensors. The primary table in the database is called 'SensorReadings' and it contains fields to store data from multiple sensors. These fields may include sensor ID, reading type, reading value, timestamp, sensor location, and status information. When a large number of sensors start transmitting data simultaneously at a very high frequency, it can cause performance issues in the database. This is because the database may not have effective data partitioning, sufficient buffering mechanisms, or proper indexing to handle the concurrent write requests. As a result, the database may experience I/O contention, which can lead to slower file transfer speeds and impact the overall performance of the system.\n" + }, + "142": { + "start_time": "1697317552", + "end_time": "1697317613", + "start_timestamp": "2023-10-15 05:05:52", + "end_timestamp": "2023-10-15 05:06:53", + "alerts": [], + "labels": [ + "POOR JOIN PERFORMANCE", + "CPU CONTENTION" + ], + "command": "python anomaly_trigger/main.py --anomaly POOR_JOIN_PERFORMANCE,CPU_CONTENTION", + "script": "import os\nimport re\nimport time\n\nimport psycopg2\n\n\nREPEATCOUNT = 1\nTIMELOGPATH = str(int(time.time())) + \"_job_trigger_time_log.txt\"\nTIMELOG = open(TIMELOGPATH, 'w+')\n\n\nclass Database():\n\n def __init__(self):\n self.conn = None\n self.conn = psycopg2.connect(database='imdbload',\n user='xxxx',\n password='xxxx',\n host='xxxx',\n port=5432)\n\n def execute_sql(self, sql):\n fail = 1\n cur = self.conn.cursor()\n i = 0\n cnt = 3\n while fail == 1 and i < cnt:\n try:\n fail = 0\n cur.execute(sql)\n except BaseException as error:\n fail = 1\n print(error)\n res = []\n if fail == 0:\n res = cur.fetchall()\n i = i + 1\n if fail == 1:\n # print(\"SQL Execution Fatal!!\", sql)\n return 0, ''\n elif fail == 0:\n return 1, res\n\n\ndef all_sql_files():\n res_path = \"{}/join-order-benchmark-master/\".format(\n os.path.dirname(os.path.abspath(__file__)))\n # all_file_list = list(filter(file_filter, os.listdir(res_path)))\n # all_file_list = sorted(all_file_list, key=custom_sort)\n all_file_list = [\n '1a.sql', '1b.sql', '1c.sql', '1d.sql',\n '2a.sql', '2b.sql', '2c.sql', '2d.sql',\n '3a.sql', '3b.sql', '3c.sql',\n '4a.sql', '4b.sql', '4c.sql',\n '5a.sql', '5b.sql', '5c.sql',\n '6a.sql', '6b.sql', '6c.sql', '6d.sql', '6e.sql', '6f.sql',\n '7a.sql', '7b.sql', '7c.sql',\n '8a.sql', '8b.sql', '8c.sql', '8d.sql',\n '9a.sql', '9b.sql', '9c.sql', '9d.sql',\n '10a.sql', '10b.sql', '10c.sql',\n '11a.sql', '11b.sql', '11c.sql', '11d.sql',\n '12a.sql', '12b.sql', '12c.sql',\n '13a.sql', '13b.sql', '13c.sql', '13d.sql',\n '14a.sql', '14b.sql', '14c.sql',\n '15a.sql', '15b.sql', '15c.sql', '15d.sql',\n '16a.sql', '16b.sql', '16c.sql', '16d.sql',\n '17a.sql', '17b.sql', '17c.sql', '17d.sql', '17e.sql', '17f.sql',\n '18a.sql', '18b.sql', '18c.sql',\n '19a.sql', '19b.sql', '19c.sql', '19d.sql',\n '20a.sql', '20b.sql', '20c.sql',\n '21a.sql', '21b.sql', '21c.sql',\n '22a.sql', '22b.sql', '22c.sql', '22d.sql',\n '23a.sql', '23b.sql', '23c.sql',\n '24a.sql', '24b.sql',\n '25a.sql', '25b.sql', '25c.sql',\n '26a.sql', '26b.sql', '26c.sql',\n '27a.sql', '27b.sql', '27c.sql',\n '28a.sql', '28b.sql', '28c.sql',\n '29a.sql', '29b.sql', '29c.sql',\n '30a.sql', '30b.sql', '30c.sql',\n '31a.sql', '31b.sql', '31c.sql',\n '32a.sql', '32b.sql',\n '33a.sql', '33b.sql', '33c.sql']\n\n print(all_file_list)\n files_list = []\n for file in all_file_list:\n 
files_list.append(res_path + file)\n return files_list\n\n\ndef custom_sort(item):\n # \u63d0\u53d6\u6570\u5b57\u548c\u5b57\u6bcd\u90e8\u5206\n match = re.match(r'(\\d+)(\\D+)', item)\n # \u5c06\u6570\u5b57\u90e8\u5206\u8f6c\u6362\u4e3a\u6574\u6570\u4ee5\u8fdb\u884c\u6bd4\u8f83\n num_part = int(match.group(1))\n # \u8fd4\u56de\u5143\u7ec4\u4ee5\u6309\u6570\u5b57\u548c\u5b57\u6bcd\u6392\u5e8f\n return (num_part, match.group(2))\n\n\ndef file_filter(f):\n if f[-4:] == '.sql' and 'schema' not in f and 'fkindexes' not in f:\n return True\n else:\n return False\n\n\ndef get_sql_from_file(file_name):\n file = open(file_name)\n lines = file.readlines().copy()\n sql = ''\n for line in lines:\n sql += line\n sql = sql.replace('\n', ' ').replace(' ', ' ').replace(' ', ' ')\n file.close()\n return sql\n\n\ndef test_hint_from_file(sql_file):\n db = Database()\n sql = get_sql_from_file(sql_file)\n success, result_cont = db.execute_sql(sql)\n print(success, result_cont)\n\n\ndef test_all():\n sql_files = all_sql_files()\n\n for sql_file in list(sql_files)[:-10]:\n if sql_file:\n test_hint_from_file(sql_file)\n\n\ndef test_one():\n res_path = \"{}/join-order-benchmark-master/\".format(\n os.path.dirname(os.path.abspath(__file__)))\n test_hint_from_file(res_path + '1a.sql')\n\n\nif __name__ == '__main__':\n for i in range(0, REPEATCOUNT):\n TIMELOG.write(str(int(time.time()))+\";\")\n test_all()\n TIMELOG.write(str(int(time.time()))+\"\n\")\n TIMELOG.flush()\n\n TIMELOG.close()\n\n)\n", + "description": "In a busy online platform, multiple users simultaneously perform join operations on a database table with poor performance. This causes high contention for CPU resources and slows down the performance of the system.\n", + "desc": "In a business intelligence scenario, particularly in a database called 'CorporateAnalyticsDB', used for analyzing business data, there is a performance issue related to join queries. These join queries are used to generate comprehensive reports by combining data from multiple tables, such as 'SalesData', 'CustomerProfiles', 'ProductCatalog', and 'MarketTrends'. However, due to the lack of effective indexing on join keys or improper query optimization, these join operations are slow and inefficient. When multiple complex join queries are executed simultaneously, the database server experiences CPU contention, as there is competition for CPU resources. 
This can further degrade the performance of these queries and impact the efficiency of generating business reports.\n" + }, + "143": { + "start_time": "1697317673", + "end_time": "1697317820", + "start_timestamp": "2023-10-15 05:07:53", + "end_timestamp": "2023-10-15 05:10:20", + "alerts": [], + "labels": [ + "FETCH_LARGE_DATA", + "CORRELATED SUBQUERY" + ], + "command": "python anomaly_trigger/main.py --anomaly FETCH_LARGE_DATA,CORRELATED_SUBQUERY", + "script": "import os\nimport re\nimport time\n\nimport psycopg2\n\n\nREPEATCOUNT = 1\nTIMELOGPATH = str(int(time.time())) + \"_tpch_trigger_time_log.txt\"\nTIMELOG = open(TIMELOGPATH, 'w+')\n\n\nclass Database():\n\n def __init__(self):\n self.conn = None\n self.conn = psycopg2.connect(database='tpch',\n user='xxxx',\n password='xxxx',\n host='xxxx',\n port=5432)\n\n def execute_sql(self, sql):\n fail = 1\n cur = self.conn.cursor()\n i = 0\n cnt = 3\n while fail == 1 and i < cnt:\n try:\n fail = 0\n cur.execute(sql)\n except BaseException as error:\n fail = 1\n print(error)\n res = []\n if fail == 0:\n res = cur.fetchall()\n i = i + 1\n if fail == 1:\n # print(\"SQL Execution Fatal!!\", sql)\n return 0, ''\n elif fail == 0:\n return 1, res\n\n\ndef all_sql_files():\n res_path = \"{}/tpch-queries/\".format(\n os.path.dirname(os.path.abspath(__file__)))\n # all_file_list = list(filter(file_filter, os.listdir(res_path)))\n # all_file_list = sorted(all_file_list, key=custom_sort)\n all_file_list = [\n '4.explain.sql']\n\n print(all_file_list)\n files_list = []\n for file in all_file_list:\n files_list.append(res_path + file)\n return files_list\n\n\ndef custom_sort(item):\n # \u63d0\u53d6\u6570\u5b57\u548c\u5b57\u6bcd\u90e8\u5206\n match = re.match(r'(\\d+)(\\D+)', item)\n # \u5c06\u6570\u5b57\u90e8\u5206\u8f6c\u6362\u4e3a\u6574\u6570\u4ee5\u8fdb\u884c\u6bd4\u8f83\n num_part = int(match.group(1))\n # \u8fd4\u56de\u5143\u7ec4\u4ee5\u6309\u6570\u5b57\u548c\u5b57\u6bcd\u6392\u5e8f\n return (num_part, match.group(2))\n\n\ndef file_filter(f):\n if f[-4:] == '.sql' and 'schema' not in f and 'fkindexes' not in f:\n return True\n else:\n return False\n\n\ndef get_sql_from_file(file_name):\n file = open(file_name)\n lines = file.readlines().copy()\n sql = ''\n for line in lines:\n sql += line\n sql = sql.replace('\n', ' ').replace(' ', ' ').replace(' ', ' ')\n file.close()\n return sql\n\n\ndef test_hint_from_file(sql_file):\n db = Database()\n sql = get_sql_from_file(sql_file)\n success, result_cont = db.execute_sql(sql)\n print(success, result_cont)\n\n\ndef test_all():\n sql_files = all_sql_files()\n\n for sql_file in list(sql_files):\n if sql_file:\n test_hint_from_file(sql_file)\n\n\ndef test_one():\n res_path = \"{}/tpch-queries/\".format(\n os.path.dirname(os.path.abspath(__file__)))\n test_hint_from_file(res_path + '1.explain.sql')\n\n\nif __name__ == '__main__':\n for i in range(0, REPEATCOUNT):\n TIMELOG.write(str(int(time.time()))+\";\")\n test_all()\n TIMELOG.write(str(int(time.time()))+\"\n\")\n TIMELOG.flush()\n\n TIMELOG.close()\n", + "description": "In an e-commerce database, when searching for the amount of inventory for each product, the system performs related subqueries. However, if these subqueries are not optimized and there are a large number of products, the performance of the inventory query may be adversely affected.\n", + "desc": "In an Internet of Things (IoT) scenario, there is a database used to collect and analyze sensor data. The database has a table called 'SensorDataDB' that stores data from various sensors. 
Each row in the table represents a reading from a sensor, and there are 200,000 rows in total. The table has 20 columns, including sensor ID, reading type, value, timestamp, location, and status. When there is a large amount of data being generated by the sensors and multiple users are querying the database for specific readings, the database might face performance issues. This is especially true when complex queries involving correlated subqueries are executed to retrieve data for specific sensor types or locations. Due to the lack of proper indexing or query optimization, these queries could take a long time to execute and put a strain on the database's resources, leading to inefficiencies in retrieving large sets of data.\n" + }, + "144": { + "start_time": "1697317880", + "end_time": "1697317952", + "start_timestamp": "2023-10-15 05:11:20", + "end_timestamp": "2023-10-15 05:12:32", + "alerts": [], + "labels": [ + "highly concurrent commits or highly concurrent inserts" + ], + "command": "python anomaly_trigger/main.py --anomaly INSERT_LARGE_DATA", + "script": "import psycopg2\nimport sys\nsys.path.append('/root/DB-GPT/')\nimport time\nimport datetime\nimport random\nimport yaml\nfrom multiprocessing.pool import *\n\nclass DBArgs(object):\n\n def __init__(self, dbtype, config, dbname=None):\n self.dbtype = dbtype\n if self.dbtype == 'mysql':\n self.host = config['host']\n self.port = config['port']\n self.user = config['user']\n self.password = config['password']\n self.dbname = dbname if dbname else config['dbname']\n self.driver = 'com.mysql.jdbc.Driver'\n self.jdbc = 'jdbc:mysql://'\n else:\n self.host = config['host']\n self.port = config['port']\n self.user = config['user']\n self.password = config['password']\n self.dbname = dbname if dbname else config['dbname']\n self.driver = 'org.postgresql.Driver'\n self.jdbc = 'jdbc:postgresql://'\n\nclass Database():\n def __init__(self, args, timeout=-1):\n self.args = args\n self.conn = self.resetConn(timeout)\n\n\n # self.schema = self.compute_table_schema()\n\n def resetConn(self, timeout=-1):\n conn = psycopg2.connect(database=self.args.dbname,\n user=self.args.user,\n password=self.args.password,\n host=self.args.host,\n port=self.args.port)\n return conn\n\n def execute_sqls(self,sql):\n self.conn =self.resetConn(timeout=-1)\n cur = self.conn.cursor()\n cur.execute(sql)\n self.conn.commit()\n cur.close()\n self.conn.close()\n\n def execute_sql_duration(self, duration, sql, max_id=0, commit_interval=500):\n self.conn = self.resetConn(timeout=-1)\n cursor = self.conn.cursor()\n start = time.time()\n cnt = 0\n if duration > 0:\n while (time.time() - start) < duration:\n if max_id > 0:\n id = random.randint(1, max_id - 1)\n cursor.execute(sql + str(id) + ';')\n else:\n cursor.execute(sql)\n cnt += 1\n if cnt % commit_interval == 0:\n self.conn.commit()\n else:\n print(\"error, the duration should be larger than 0\")\n self.conn.commit()\n cursor.close()\n self.conn.close()\n return cnt\n\n def concurrent_execute_sql(self, threads, duration, sql, max_id=0, commit_interval=500):\n pool = ThreadPool(threads)\n results = [pool.apply_async(self.execute_sql_duration, (duration, sql, max_id, commit_interval)) for _ in range(threads)]\n pool.close()\n pool.join()\n return results\n \ndef init():\n #add the config\n config_path = \"/root/DB-GPT/config/tool_config.yaml\"\n with open(config_path, 'r') as config_file:\n config = yaml.safe_load(config_file) \n db_args =DBArgs('pgsql', config)\n return db_args\n\n#create a table\ndef 
create_table(table_name,colsize, ncolumns):\n db=Database(init())\n column_definitions = ', '.join(f'name{i} varchar({colsize})' for i in range(ncolumns))\n creat_sql = f'CREATE TABLE {table_name} (id int, {column_definitions}, time timestamp);'\n db.execute_sqls(creat_sql)\n\n#delete the table\ndef delete_table(table_name):\n db=Database(init())\n delete_sql=f'DROP TABLE if exists {table_name}'\n db.execute_sqls(delete_sql)\n\n#print the current time\ndef print_time():\n current_time = datetime.datetime.now()\n formatted_time = current_time.strftime(\"%Y-%m-%d %H:%M:%S\")\n print(formatted_time)\n\n\ndef insert_large_data(threads,duration,ncolumns,nrows,colsize,table_name='table1'):\n\n print_time()\n #Delete undeleted tables\n delete_table(table_name)\n #create a new table\n create_table(table_name,colsize, ncolumns)\n db=Database(init())\n #insert the data\n #insert_definitions = ', '.join(f'repeat(round(random()*999)::text,{(colsize//3)})' for i in range(ncolumns))\n insert_definitions = ', '.join(f'(SELECT substr(md5(random()::text), 1, {colsize}))' for i in range(ncolumns))\n insert_data=f'insert into {table_name} select generate_series(1,{nrows}),{insert_definitions}, now();'\n db.concurrent_execute_sql(threads,duration,insert_data,commit_interval=1)\n\n #delete the table\n delete_table(table_name)\n \n #print the end time\n print_time()\nif __name__ == \"__main__\":\n # Number of threads to use for concurrent inserts\n num_threads = 155\n \n # Duration for which to run the inserts (in seconds)\n insert_duration = None\n \n # Number of columns in the table\n num_columns = 17\n \n # Number of rows to insert\n num_rows = 60\n \n # Size of each column (in characters)\n column_size = 51\n \n # Table name\n table_name = 'table1'\n \n # Call the insert_large_data function\n insert_large_data(num_threads, insert_duration, num_columns, num_rows, column_size, table_name)\n", + "description": "In a data logging system, 155 data logs need to be inserted into a database table simultaneously. Each data log contains 17 columns, with each column having a size of 51 characters. The dataset consists of 60 logs. This script simulates the database exception caused by the simultaneous insertion of a large amount of data.\n", + "desc": "In an Internet of Things (IoT) scenario, consider a database used to collect and analyze sensor data from various sources. This database, called 'SensorDataDB', is designed to handle a large volume of data. It contains a primary table called 'SensorReadings' that stores information such as sensor ID, reading type, reading value, timestamp, sensor location, and status. In this particular scenario, there is a need to insert a large amount of data into the 'SensorReadings' table. The database administrator runs a script using the 'anomaly_trigger' module, specifying the anomaly 'INSERT_LARGE_DATA'. The script is executed with 155 threads, indicating a high level of concurrency, and other parameters such as the number of columns (17), column size (51 characters), and number of rows (60).This scenario simulates a situation where multiple sensors are simultaneously sending data to the database at a high frequency. Due to the large amount of concurrent write requests, the database's ability to handle them efficiently may be limited. This can result in increased write latency, potential database locking, and potentially lead to anomalies in the database system. 
Specifically, in this scenario, the large amount of concurrent insert operations may cause contention and impact the performance and reliability of the database.\n" + }, + "145": { + "start_time": "1697318012", + "end_time": "1697318084", + "start_timestamp": "2023-10-15 05:13:32", + "end_timestamp": "2023-10-15 05:14:44", + "alerts": [], + "labels": [ + "highly concurrent commits or highly concurrent inserts" + ], + "command": "python anomaly_trigger/main.py --anomaly INSERT_LARGE_DATA", + "script": "import psycopg2\nimport sys\nsys.path.append('/root/DB-GPT/')\nimport time\nimport datetime\nimport random\nimport yaml\nfrom multiprocessing.pool import *\n\nclass DBArgs(object):\n\n def __init__(self, dbtype, config, dbname=None):\n self.dbtype = dbtype\n if self.dbtype == 'mysql':\n self.host = config['host']\n self.port = config['port']\n self.user = config['user']\n self.password = config['password']\n self.dbname = dbname if dbname else config['dbname']\n self.driver = 'com.mysql.jdbc.Driver'\n self.jdbc = 'jdbc:mysql://'\n else:\n self.host = config['host']\n self.port = config['port']\n self.user = config['user']\n self.password = config['password']\n self.dbname = dbname if dbname else config['dbname']\n self.driver = 'org.postgresql.Driver'\n self.jdbc = 'jdbc:postgresql://'\n\nclass Database():\n def __init__(self, args, timeout=-1):\n self.args = args\n self.conn = self.resetConn(timeout)\n\n\n # self.schema = self.compute_table_schema()\n\n def resetConn(self, timeout=-1):\n conn = psycopg2.connect(database=self.args.dbname,\n user=self.args.user,\n password=self.args.password,\n host=self.args.host,\n port=self.args.port)\n return conn\n\n def execute_sqls(self,sql):\n self.conn =self.resetConn(timeout=-1)\n cur = self.conn.cursor()\n cur.execute(sql)\n self.conn.commit()\n cur.close()\n self.conn.close()\n\n def execute_sql_duration(self, duration, sql, max_id=0, commit_interval=500):\n self.conn = self.resetConn(timeout=-1)\n cursor = self.conn.cursor()\n start = time.time()\n cnt = 0\n if duration > 0:\n while (time.time() - start) < duration:\n if max_id > 0:\n id = random.randint(1, max_id - 1)\n cursor.execute(sql + str(id) + ';')\n else:\n cursor.execute(sql)\n cnt += 1\n if cnt % commit_interval == 0:\n self.conn.commit()\n else:\n print(\"error, the duration should be larger than 0\")\n self.conn.commit()\n cursor.close()\n self.conn.close()\n return cnt\n\n def concurrent_execute_sql(self, threads, duration, sql, max_id=0, commit_interval=500):\n pool = ThreadPool(threads)\n results = [pool.apply_async(self.execute_sql_duration, (duration, sql, max_id, commit_interval)) for _ in range(threads)]\n pool.close()\n pool.join()\n return results\n \ndef init():\n #add the config\n config_path = \"/root/DB-GPT/config/tool_config.yaml\"\n with open(config_path, 'r') as config_file:\n config = yaml.safe_load(config_file) \n db_args =DBArgs('pgsql', config)\n return db_args\n\n#create a table\ndef create_table(table_name,colsize, ncolumns):\n db=Database(init())\n column_definitions = ', '.join(f'name{i} varchar({colsize})' for i in range(ncolumns))\n creat_sql = f'CREATE TABLE {table_name} (id int, {column_definitions}, time timestamp);'\n db.execute_sqls(creat_sql)\n\n#delete the table\ndef delete_table(table_name):\n db=Database(init())\n delete_sql=f'DROP TABLE if exists {table_name}'\n db.execute_sqls(delete_sql)\n\n#print the current time\ndef print_time():\n current_time = datetime.datetime.now()\n formatted_time = current_time.strftime(\"%Y-%m-%d %H:%M:%S\")\n 
print(formatted_time)\n\n\ndef insert_large_data(threads,duration,ncolumns,nrows,colsize,table_name='table1'):\n\n print_time()\n #Delete undeleted tables\n delete_table(table_name)\n #create a new table\n create_table(table_name,colsize, ncolumns)\n db=Database(init())\n #insert the data\n #insert_definitions = ', '.join(f'repeat(round(random()*999)::text,{(colsize//3)})' for i in range(ncolumns))\n insert_definitions = ', '.join(f'(SELECT substr(md5(random()::text), 1, {colsize}))' for i in range(ncolumns))\n insert_data=f'insert into {table_name} select generate_series(1,{nrows}),{insert_definitions}, now();'\n db.concurrent_execute_sql(threads,duration,insert_data,commit_interval=1)\n\n #delete the table\n delete_table(table_name)\n \n #print the end time\n print_time()\nif __name__ == \"__main__\":\n # Number of threads to use for concurrent inserts\n num_threads = 155\n \n # Duration for which to run the inserts (in seconds)\n insert_duration = None\n \n # Number of columns in the table\n num_columns = 26\n \n # Number of rows to insert\n num_rows = 85\n \n # Size of each column (in characters)\n column_size = 84\n \n # Table name\n table_name = 'table1'\n \n # Call the insert_large_data function\n insert_large_data(num_threads, insert_duration, num_columns, num_rows, column_size, table_name)\n", + "description": "In a smart home system, 155 sensors are collecting data from various devices and sending it to the database. This large amount of data needs to be inserted into the database simultaneously. Each device has 26 parameters with a column size of 84 characters, and there are a total of 85 devices. This process may result in an exception in the database due to the high volume of data being inserted.\n", + "desc": "In a business intelligence scenario, suppose there is a database called 'BusinessIntelligenceDB' used for storing and analyzing various business data. This database contains multiple key tables such as 'SalesData', 'CustomerProfiles', 'ProductCatalog', and 'MarketTrends', each filled with a large number of rows and columns, recording detailed business information. In this scenario, the company's analysts frequently need to insert a large amount of data into these tables to update or add new records. For example, they might need to insert sales records into the 'SalesData' table, customer information into the 'CustomerProfiles' table, and product information into the 'ProductCatalog' table. Given the size of these tables and the number of rows and columns to be inserted, executing these insert operations on such a large scale can lead to performance issues. If the database or the insert operations are not properly optimized, the insertion of a large amount of data can consume a significant amount of time and resources. This might result in slower data insertion speeds and could potentially impact other operations that rely on the database, such as querying or reporting. 
Additionally, if such large-scale insert operations occur frequently, they could put additional strain on the server's CPU and memory resources, potentially causing CPU contention and impacting overall performance.\n" + }, + "146": { + "start_time": "1697318144", + "end_time": "1697318204", + "start_timestamp": "2023-10-15 05:15:44", + "end_timestamp": "2023-10-15 05:16:44", + "alerts": [], + "labels": [ + "highly concurrent updates" + ], + "command": "python anomaly_trigger/main.py --anomaly LOCK_CONTENTION", + "script": "import psycopg2\nimport sys\nsys.path.append('/root/DB-GPT/')\nimport time\nimport datetime\nimport random\nimport yaml\nfrom multiprocessing.pool import *\n\n\nclass DBArgs(object):\n\n def __init__(self, dbtype, config, dbname=None):\n self.dbtype = dbtype\n if self.dbtype == 'mysql':\n self.host = config['host']\n self.port = config['port']\n self.user = config['user']\n self.password = config['password']\n self.dbname = dbname if dbname else config['dbname']\n self.driver = 'com.mysql.jdbc.Driver'\n self.jdbc = 'jdbc:mysql://'\n else:\n self.host = config['host']\n self.port = config['port']\n self.user = config['user']\n self.password = config['password']\n self.dbname = dbname if dbname else config['dbname']\n self.driver = 'org.postgresql.Driver'\n self.jdbc = 'jdbc:postgresql://'\n\nclass Database():\n def __init__(self, args, timeout=-1):\n self.args = args\n self.conn = self.resetConn(timeout)\n\n\n # self.schema = self.compute_table_schema()\n\n def resetConn(self, timeout=-1):\n conn = psycopg2.connect(database=self.args.dbname,\n user=self.args.user,\n password=self.args.password,\n host=self.args.host,\n port=self.args.port)\n return conn\n \n def execute_sqls(self,sql):\n self.conn =self.resetConn(timeout=-1)\n cur = self.conn.cursor()\n cur.execute(sql)\n self.conn.commit()\n cur.close()\n self.conn.close()\n\n def execute_sql_duration(self, duration, sql, max_id=0, commit_interval=500):\n self.conn = self.resetConn(timeout=-1)\n cursor = self.conn.cursor()\n start = time.time()\n cnt = 0\n if duration > 0:\n while (time.time() - start) < duration:\n if max_id > 0:\n id = random.randint(1, max_id - 1)\n cursor.execute(sql + str(id) + ';')\n else:\n cursor.execute(sql)\n cnt += 1\n if cnt % commit_interval == 0:\n self.conn.commit()\n else:\n print(\"error, the duration should be larger than 0\")\n self.conn.commit()\n cursor.close()\n self.conn.close()\n return cnt\n\n def concurrent_execute_sql(self, threads, duration, sql, max_id=0, commit_interval=500):\n pool = ThreadPool(threads)\n results = [pool.apply_async(self.execute_sql_duration, (duration, sql, max_id, commit_interval)) for _ in range(threads)]\n pool.close()\n pool.join()\n return results\n\ndef init():\n #add the config\n config_path = \"/root/DB-GPT/config/tool_config.yaml\"\n with open(config_path, 'r') as config_file:\n config = yaml.safe_load(config_file) \n db_args =DBArgs('pgsql', config)\n return db_args\n\n\n#create a table\ndef create_table(table_name,colsize, ncolumns):\n db=Database(init())\n column_definitions = ', '.join(f'name{i} varchar({colsize})' for i in range(ncolumns))\n creat_sql = f'CREATE TABLE {table_name} (id int, {column_definitions}, time timestamp);'\n db.execute_sqls(creat_sql)\n\n#delete the table\ndef delete_table(table_name):\n db=Database(init())\n delete_sql=f'DROP TABLE if exists {table_name}'\n db.execute_sqls(delete_sql)\n\n#print the current time\ndef print_time():\n current_time = datetime.datetime.now()\n formatted_time = 
current_time.strftime(\"%Y-%m-%d %H:%M:%S\")\n print(formatted_time)\n\n\ndef lock_contention(threads,duration,ncolumns,nrows,colsize,table_name='table1'):\n #create a new table\n print_time()\n delete_table(table_name)\n create_table(table_name,colsize, ncolumns)\n db=Database(init())\n # insert some data to be updated \n insert_definitions = ', '.join(f'(SELECT substr(md5(random()::text), 1, {colsize}))' for i in range(ncolumns))\n insert_data=f'insert into {table_name} select generate_series(1,{nrows}),{insert_definitions}, now();' \n db.execute_sqls(insert_data) \n pool = Pool(threads)\n for _ in range(threads):\n pool.apply_async(\n lock, (table_name, ncolumns, colsize, duration, nrows))\n pool.close()\n pool.join()\n #delete the table\n delete_table(table_name)\n print_time()\n\ndef lock(table_name, ncolumns, colsize, duration, nrows):\n args=init()\n start = time.time()\n #lock_contention\n while time.time()-start < duration:\n conn = psycopg2.connect(database=args.dbname, user=args.user, password=args.password,\n host=args.host, port=args.port)\n cur = conn.cursor()\n while time.time()-start < duration:\n col_name = random.randint(0, ncolumns-1)\n row_name = random.randint(1, nrows-1)\n lock_contention = f'update {table_name} set name{col_name}=(SELECT substr(md5(random()::text), 1, {colsize})) where id ={row_name}'\n #db.concurrent_execute_sql(threads,duration,lock_contention,nrows)\n cur.execute(lock_contention)\n conn.commit()\n conn.commit()\n conn.close()\n\nif __name__ == \"__main__\":\n # Number of threads to use for concurrent inserts\n num_threads = 102\n \n # Duration for which to run the inserts (in seconds)\n insert_duration = None\n \n # Number of columns in the table\n num_columns = 52\n \n # Number of rows to insert\n num_rows = 340\n \n # Size of each column (in characters)\n column_size = 51\n \n # Table name\n table_name = 'table1'\n \n # Call the insert_large_data function\n lock_contention(num_threads, insert_duration, num_columns, num_rows, column_size, table_name)\n", + "description": "In a multi-user system, 102 users simultaneously attempt to update a database table with 52 columns and 340 rows of records, each with a column size of 51 characters. These users compete with each other to lock the database table during the update operation, causing contention and potentially leading to a database exception.\n", + "desc": "In an Internet of Things (IoT) scenario, let's consider a database used for collecting and analyzing sensor data, called 'SensorDataDB'. This database is designed to handle data from various sensors and contains a key table called 'SensorReadings', which stores sensor readings such as temperature, humidity, pressure, light, and motion. This table consists of 340 rows, each representing a sensor reading, with a total of 52 columns, each containing information with a size of up to 51 characters. These columns include sensor ID, reading type, reading value, timestamp, sensor location, and status information. In this scenario, 102 sensors are actively transmitting data at the same time and trying to update the 'SensorReadings' table. Due to the high concurrency of these update operations, there might be a contention for database locks. This lock contention occurs when multiple sensors are competing for access to the table, causing delays or blocking of certain operations. If this lock contention lasts for a prolonged period, it could affect the real-time nature of the sensor data processing system, leading to delayed or missed sensor readings. 
This can disrupt the overall functionality and reliability of the IoT system, potentially impacting critical decision-making or automation processes that rely on accurate and timely sensor data.\n" + }, + "147": { + "start_time": "1697318264", + "end_time": "1697318306", + "start_timestamp": "2023-10-15 05:17:44", + "end_timestamp": "2023-10-15 05:18:26", + "alerts": [], + "labels": [ + "highly deletes" + ], + "command": "python anomaly_trigger/main.py --anomaly VACUUM", + "script": "import psycopg2\nimport sys\nsys.path.append('/root/DB-GPT/')\nimport time\nimport datetime\nimport random\nimport yaml\nfrom multiprocessing.pool import *\n\n\nclass DBArgs(object):\n\n def __init__(self, dbtype, config, dbname=None):\n self.dbtype = dbtype\n if self.dbtype == 'mysql':\n self.host = config['host']\n self.port = config['port']\n self.user = config['user']\n self.password = config['password']\n self.dbname = dbname if dbname else config['dbname']\n self.driver = 'com.mysql.jdbc.Driver'\n self.jdbc = 'jdbc:mysql://'\n else:\n self.host = config['host']\n self.port = config['port']\n self.user = config['user']\n self.password = config['password']\n self.dbname = dbname if dbname else config['dbname']\n self.driver = 'org.postgresql.Driver'\n self.jdbc = 'jdbc:postgresql://'\n\nclass Database():\n def __init__(self, args, timeout=-1):\n self.args = args\n self.conn = self.resetConn(timeout)\n\n\n # self.schema = self.compute_table_schema()\n\n def resetConn(self, timeout=-1):\n conn = psycopg2.connect(database=self.args.dbname,\n user=self.args.user,\n password=self.args.password,\n host=self.args.host,\n port=self.args.port)\n return conn\n \n def execute_sqls(self,sql):\n self.conn =self.resetConn(timeout=-1)\n cur = self.conn.cursor()\n cur.execute(sql)\n self.conn.commit()\n cur.close()\n self.conn.close()\n\n def execute_sql_duration(self, duration, sql, max_id=0, commit_interval=500):\n self.conn = self.resetConn(timeout=-1)\n cursor = self.conn.cursor()\n start = time.time()\n cnt = 0\n if duration > 0:\n while (time.time() - start) < duration:\n if max_id > 0:\n id = random.randint(1, max_id - 1)\n cursor.execute(sql + str(id) + ';')\n else:\n cursor.execute(sql)\n cnt += 1\n if cnt % commit_interval == 0:\n self.conn.commit()\n else:\n print(\"error, the duration should be larger than 0\")\n self.conn.commit()\n cursor.close()\n self.conn.close()\n return cnt\n\n def concurrent_execute_sql(self, threads, duration, sql, max_id=0, commit_interval=500):\n pool = ThreadPool(threads)\n results = [pool.apply_async(self.execute_sql_duration, (duration, sql, max_id, commit_interval)) for _ in range(threads)]\n pool.close()\n pool.join()\n return results\n \ndef init():\n #add the config\n config_path = \"/root/DB-GPT/config/tool_config.yaml\"\n with open(config_path, 'r') as config_file:\n config = yaml.safe_load(config_file) \n db_args =DBArgs('pgsql', config)\n return db_args\n\n\n#create a table\ndef create_table(table_name,colsize, ncolumns):\n db=Database(init())\n column_definitions = ', '.join(f'name{i} varchar({colsize})' for i in range(ncolumns))\n creat_sql = f'CREATE TABLE {table_name} (id int, {column_definitions}, time timestamp);'\n db.execute_sqls(creat_sql)\n\n#delete the table\ndef delete_table(table_name):\n db=Database(init())\n delete_sql=f'DROP TABLE if exists {table_name}'\n db.execute_sqls(delete_sql)\n\n#print the current time\ndef print_time():\n current_time = datetime.datetime.now()\n formatted_time = current_time.strftime(\"%Y-%m-%d %H:%M:%S\")\n 
print(formatted_time)\n\n\ndef vacuum(threads,duration,ncolumns,nrows,colsize,table_name='table1'):\n db=Database(init())\n #create a new table\n print_time()\n delete_table(table_name)\n create_table(table_name,colsize, ncolumns)\n\n # insert some data to be deleted\n insert_definitions = ', '.join(f'(SELECT substr(md5(random()::text), 1, {colsize}))' for i in range(ncolumns))\n insert_data=f'insert into {table_name} select generate_series(1,{nrows}),{insert_definitions}, now();' \n db.execute_sqls(insert_data) \n\n # delete 80% of the rows\n delete_nrows=int(nrows*0.8)\n vacuum=f'delete from {table_name} where id < {delete_nrows};'\n db.execute_sqls(vacuum)\n\n # do the select , then the vacuum occurs\n select='select * from '+table_name+' where id='\n db.concurrent_execute_sql(threads,duration,select,nrows)\n\n #delete the table\n delete_table(table_name)\n print_time()\n\n\nif __name__ == \"__main__\":\n # Number of threads to use for concurrent inserts\n num_threads = 107\n \n # Duration for which to run the inserts (in seconds)\n insert_duration = None\n \n # Number of columns in the table\n num_columns = 17\n \n # Number of rows to insert\n num_rows = 3637430\n \n # Size of each column (in characters)\n column_size = 84\n \n # Table name\n table_name = 'table1'\n \n # Call the insert_large_data function\n vacuum(num_threads, insert_duration, num_columns, num_rows, column_size, table_name)\n", + "description": "In the database of a large e-commerce platform, multiple users perform a search operation after conducting a data cleaning process on a database table containing 17 columns and 3,637,430 rows of product records, each with a column size of 84 characters. This search operation is simulated to trigger an exception in the database due to the lack of necessary optimization for such a large dataset. The search is performed concurrently by 107 users, emphasizing the potential performance and scalability issues that may arise in real-life scenarios.\n", + "desc": "In an Internet of Things (IoT) scenario, suppose there is a database called 'SensorDataDB', which is designed to collect and analyze sensor data. This database is used to handle a large volume of data from various types of sensors. One of the key tables in this database is 'SensorReadings', which stores information about different readings from sensors. It contains 3,637,430 rows of data, each representing a reading from a sensor. The table consists of 17 columns, each with a size of 84 characters. These columns include sensor ID, reading type, reading value, timestamp, sensor location, and status information.In this scenario, the database administrator needs to perform a VACUUM operation on the 'SensorReadings' table. VACUUM is a maintenance operation that reclaims storage occupied by deleted or outdated data. It helps optimize the performance and efficiency of the database by freeing up unused space.During the VACUUM operation, 107 threads are used to process the data. These threads work simultaneously to identify and remove the unnecessary data. The goal is to reorganize the storage and improve the database's efficiency.This VACUUM operation is crucial for the database's optimal performance because it eliminates excess storage usage and reduces fragmentation. However, performing such a large-scale VACUUM operation can also impact the database's overall performance. 
It requires careful planning and consideration to minimize any potential disruptions to the normal operation of the IoT system.\n" + }, + "148": { + "start_time": "1697318366", + "end_time": "1697318480", + "start_timestamp": "2023-10-15 05:19:26", + "end_timestamp": "2023-10-15 05:21:20", + "alerts": [], + "labels": [ + "too many indexes" + ], + "command": "python anomaly_trigger/main.py --anomaly REDUNDANT_INDEX", + "script": "import psycopg2\nimport sys\nsys.path.append('/root/DB-GPT/')\nimport time\nimport datetime\nimport random\nimport yaml\nfrom multiprocessing.pool import *\n\n\nclass DBArgs(object):\n\n def __init__(self, dbtype, config, dbname=None):\n self.dbtype = dbtype\n if self.dbtype == 'mysql':\n self.host = config['host']\n self.port = config['port']\n self.user = config['user']\n self.password = config['password']\n self.dbname = dbname if dbname else config['dbname']\n self.driver = 'com.mysql.jdbc.Driver'\n self.jdbc = 'jdbc:mysql://'\n else:\n self.host = config['host']\n self.port = config['port']\n self.user = config['user']\n self.password = config['password']\n self.dbname = dbname if dbname else config['dbname']\n self.driver = 'org.postgresql.Driver'\n self.jdbc = 'jdbc:postgresql://'\n\nclass Database():\n def __init__(self, args, timeout=-1):\n self.args = args\n self.conn = self.resetConn(timeout)\n\n\n # self.schema = self.compute_table_schema()\n\n def resetConn(self, timeout=-1):\n conn = psycopg2.connect(database=self.args.dbname,\n user=self.args.user,\n password=self.args.password,\n host=self.args.host,\n port=self.args.port)\n return conn\n \n def execute_sqls(self,sql):\n self.conn =self.resetConn(timeout=-1)\n cur = self.conn.cursor()\n cur.execute(sql)\n self.conn.commit()\n cur.close()\n self.conn.close()\n\n def execute_sql_duration(self, duration, sql, max_id=0, commit_interval=500):\n self.conn = self.resetConn(timeout=-1)\n cursor = self.conn.cursor()\n start = time.time()\n cnt = 0\n if duration > 0:\n while (time.time() - start) < duration:\n if max_id > 0:\n id = random.randint(1, max_id - 1)\n cursor.execute(sql + str(id) + ';')\n else:\n cursor.execute(sql)\n cnt += 1\n if cnt % commit_interval == 0:\n self.conn.commit()\n else:\n print(\"error, the duration should be larger than 0\")\n self.conn.commit()\n cursor.close()\n self.conn.close()\n return cnt\n\n def concurrent_execute_sql(self, threads, duration, sql, max_id=0, commit_interval=500):\n pool = ThreadPool(threads)\n results = [pool.apply_async(self.execute_sql_duration, (duration, sql, max_id, commit_interval)) for _ in range(threads)]\n pool.close()\n pool.join()\n return results\n \n def build_index(self, table_name, idx_num):\n self.conn = self.resetConn(timeout=-1)\n cursor = self.conn.cursor()\n \n for i in range(0, idx_num):\n the_sql = 'CREATE INDEX index_' + table_name + '_' + str(i) + ' ON ' + table_name + '(name' + str(i) + ');'\n print(the_sql)\n cursor.execute(the_sql)\n\n \n self.conn.commit()\n self.conn.close()\n return\n\n\n \n def drop_index(self,table_name):\n self.conn = self.resetConn(timeout=-1)\n cursor = self.conn.cursor()\n cursor.execute(\"select indexname from pg_indexes where tablename='\"+table_name+\"';\")\n idxs = cursor.fetchall()\n for idx in idxs:\n the_sql = 'DROP INDEX ' + idx[0] + ';'\n cursor.execute(the_sql)\n print(the_sql)\n self.conn.commit()\n self.conn.close()\n return\n\n\ndef init():\n #add the config\n config_path = \"/root/DB-GPT/config/tool_config.yaml\"\n with open(config_path, 'r') as config_file:\n config = 
yaml.safe_load(config_file) \n db_args =DBArgs('pgsql', config)\n return db_args\n\n\n#create a table\ndef create_table(table_name,colsize, ncolumns):\n db=Database(init())\n column_definitions = ', '.join(f'name{i} varchar({colsize})' for i in range(ncolumns))\n creat_sql = f'CREATE TABLE {table_name} (id int, {column_definitions}, time timestamp);'\n db.execute_sqls(creat_sql)\n\n#delete the table\ndef delete_table(table_name):\n db=Database(init())\n delete_sql=f'DROP TABLE if exists {table_name}'\n db.execute_sqls(delete_sql)\n\n#print the current time\ndef print_time():\n current_time = datetime.datetime.now()\n formatted_time = current_time.strftime(\"%Y-%m-%d %H:%M:%S\")\n print(formatted_time)\n\ndef redundent_index(threads,duration,ncolumns,nrows,colsize,nindex,table_name='table1'):\n #create a new table\n print_time()\n delete_table(table_name)\n create_table(table_name,colsize, ncolumns)\n db=Database(init())\n # insert some data to be updated \n insert_definitions = ', '.join(f'(SELECT substr(md5(random()::text), 1, {colsize}))' for i in range(ncolumns))\n insert_data=f'insert into {table_name} select generate_series(1,{nrows}),{insert_definitions}, now();' \n db.execute_sqls(insert_data) \n\n #initialization of the indexes\n nindex=int((nindex*ncolumns)/10)\n db.build_index(table_name,nindex)\n id_index='CREATE INDEX index_'+table_name+'_id ON '+table_name+'(id);'\n db.execute_sqls(id_index)\n\n #lock_contention\n pool = Pool(threads)\n for _ in range(threads):\n pool.apply_async(\n lock, (table_name, ncolumns, colsize, duration, nrows))\n pool.close()\n pool.join()\n\n #drop the index\n db.drop_index(table_name)\n\n #delete the table\n delete_table(table_name)\n print_time()\n\n\ndef lock(table_name, ncolumns, colsize, duration, nrows):\n args=init()\n start = time.time()\n #lock_contention\n while time.time()-start < duration:\n conn = psycopg2.connect(database=args.dbname, user=args.user, password=args.password,\n host=args.host, port=args.port)\n cur = conn.cursor()\n while time.time()-start < duration:\n col_name = random.randint(0, ncolumns-1)\n row_name = random.randint(1, nrows-1)\n lock_contention = f'update {table_name} set name{col_name}=(SELECT substr(md5(random()::text), 1, {colsize})) where id ={row_name}'\n #db.concurrent_execute_sql(threads,duration,lock_contention,nrows)\n cur.execute(lock_contention)\n conn.commit()\n conn.commit()\n conn.close()\n\nif __name__ == \"__main__\":\n # Number of threads to use for concurrent inserts\n num_threads = 8\n \n # Duration for which to run the inserts (in seconds)\n insert_duration = None\n \n # Number of columns in the table\n num_columns = 95\n \n # Number of rows to insert\n num_rows = 804778\n \n # Size of each column (in characters)\n column_size = 78\n \n # Table name\n table_name = 'table1'\n \n nindex=6\n \n # Call the insert_large_data function\n redundent_index(num_threads, insert_duration, num_columns, num_rows, column_size, nindex,table_name)\n", + "description": "In an online retail database, indexes are created redundantly for various product attributes like name, category, and price range. This leads to additional storage requirements and performance overhead. Simulate the effects of this situation with 8 threads searching in a database table containing 95 columns and 804,778 rows of product records, where each column has a size of 78 characters.\n", + "desc": "In an internet of things (IoT) scenario, let's say there is a database called 'IoTDataDB' that stores data from various IoT devices. 
This database is designed to handle a large volume of data generated by these devices. One of the key tables in the database is called 'DeviceData', which contains information about the devices and the data they generate. This table has 804,778 rows of data, with each row representing a data entry from a device. It has a total of 95 columns, each column containing information of up to 78 characters. These columns may include device ID, device type, location, sensor readings, timestamp, device status, and other relevant information.In this scenario, the database administrators want to optimize the performance of the database for querying and analyzing the data from these IoT devices. To enhance the efficiency of these queries, they decide to create redundant indexes on certain columns, such as device type, location, and timestamp. These indexes can help accelerate the execution of queries that involve filtering or sorting based on these columns.However, the creation of these redundant indexes can have unintended consequences. It can lead to additional storage usage and might impact the performance of data insertion and update operations. Moreover, frequent index operations can cause fragmentation in the database, further affecting performance. In an IoT environment, where data is generated and updated continuously, these redundant index operations, if not properly managed, can trigger anomalies and impact the overall performance of the database.\n" + }, + "149": { + "start_time": "1697318540", + "end_time": "1697318631", + "start_timestamp": "2023-10-15 05:22:20", + "end_timestamp": "2023-10-15 05:23:51", + "alerts": [], + "labels": [ + "INSERT_LARGE_DATA", + "IO_CONTENTION" + ], + "command": "python anomaly_trigger/main.py --anomaly INSERT_LARGE_DATA,IO_CONTENTION", + "script": "import os\nimport datetime\n\n#print the current time\ndef print_time():\n current_time = datetime.datetime.now()\n formatted_time = current_time.strftime(\"%Y-%m-%d %H:%M:%S\")\n print(formatted_time)\n\nif __name__ == \"__main__\":\n print_time()\n command = (\n \"su - root -c 'cd /sysbench-tpcc-master; \"\n \"./tpcc.lua --db-driver=pgsql --tables=2 --scale=3 --threads=50 --events=0 \"\n \"--pgsql-host=xxxx --pgsql-user=xxxx --pgsql-password=xxxx \"\n \"--pgsql-port=5432 --pgsql-db=tpcc --time=90 --rand-type=uniform --report-interval=10 run'\"\n )\n\n os.system(command)\n print_time()\n", + "description": "In a busy file sharing system, multiple users are uploading, downloading, or editing files at the same time. This creates competition for I/O resources, causing file transfer to slow down.\n", + "desc": "In an IoT (Internet of Things) scenario, imagine there is a smart home system that collects sensor data from various devices, such as temperature sensors, motion sensors, and light sensors. The system uses a database called 'SensorDataDB' to store and analyze this data. The database contains a table named 'SensorReadings', which records the readings from these sensors. Each row in this table represents a specific reading from a sensor and includes information such as the sensor ID, reading type (e.g., temperature, motion), reading value, timestamp, and sensor location.In this scenario, multiple sensors in the smart home system start transmitting data simultaneously at a very high frequency. This results in a large number of write requests being sent to the database to store this sensor data. 
However, due to insufficient buffering mechanisms in the database or inadequate indexing, the database struggles to handle these numerous concurrent write requests efficiently. This leads to I/O contention, where the storage and network bandwidth of the system are significantly strained. As a result, the file transfer speeds in the system become slower, especially when there is limited bandwidth or insufficient server processing capabilities.Furthermore, frequent write operations in the database can also impact its overall performance. During peak periods, the database might encounter locking and transaction management issues, further slowing down the file processing and metadata recording. This can affect the performance and responsiveness of the entire smart home system.\n" + }, + "150": { + "start_time": "1697318691", + "end_time": "1697318752", + "start_timestamp": "2023-10-15 05:24:51", + "end_timestamp": "2023-10-15 05:25:52", + "alerts": [], + "labels": [ + "POOR JOIN PERFORMANCE", + "CPU CONTENTION" + ], + "command": "python anomaly_trigger/main.py --anomaly POOR_JOIN_PERFORMANCE,CPU_CONTENTION", + "script": "import os\nimport re\nimport time\n\nimport psycopg2\n\n\nREPEATCOUNT = 1\nTIMELOGPATH = str(int(time.time())) + \"_job_trigger_time_log.txt\"\nTIMELOG = open(TIMELOGPATH, 'w+')\n\n\nclass Database():\n\n def __init__(self):\n self.conn = None\n self.conn = psycopg2.connect(database='imdbload',\n user='xxxx',\n password='xxxx',\n host='xxxx',\n port=5432)\n\n def execute_sql(self, sql):\n fail = 1\n cur = self.conn.cursor()\n i = 0\n cnt = 3\n while fail == 1 and i < cnt:\n try:\n fail = 0\n cur.execute(sql)\n except BaseException as error:\n fail = 1\n print(error)\n res = []\n if fail == 0:\n res = cur.fetchall()\n i = i + 1\n if fail == 1:\n # print(\"SQL Execution Fatal!!\", sql)\n return 0, ''\n elif fail == 0:\n return 1, res\n\n\ndef all_sql_files():\n res_path = \"{}/join-order-benchmark-master/\".format(\n os.path.dirname(os.path.abspath(__file__)))\n # all_file_list = list(filter(file_filter, os.listdir(res_path)))\n # all_file_list = sorted(all_file_list, key=custom_sort)\n all_file_list = [\n '1a.sql', '1b.sql', '1c.sql', '1d.sql',\n '2a.sql', '2b.sql', '2c.sql', '2d.sql',\n '3a.sql', '3b.sql', '3c.sql',\n '4a.sql', '4b.sql', '4c.sql',\n '5a.sql', '5b.sql', '5c.sql',\n '6a.sql', '6b.sql', '6c.sql', '6d.sql', '6e.sql', '6f.sql',\n '7a.sql', '7b.sql', '7c.sql',\n '8a.sql', '8b.sql', '8c.sql', '8d.sql',\n '9a.sql', '9b.sql', '9c.sql', '9d.sql',\n '10a.sql', '10b.sql', '10c.sql',\n '11a.sql', '11b.sql', '11c.sql', '11d.sql',\n '12a.sql', '12b.sql', '12c.sql',\n '13a.sql', '13b.sql', '13c.sql', '13d.sql',\n '14a.sql', '14b.sql', '14c.sql',\n '15a.sql', '15b.sql', '15c.sql', '15d.sql',\n '16a.sql', '16b.sql', '16c.sql', '16d.sql',\n '17a.sql', '17b.sql', '17c.sql', '17d.sql', '17e.sql', '17f.sql',\n '18a.sql', '18b.sql', '18c.sql',\n '19a.sql', '19b.sql', '19c.sql', '19d.sql',\n '20a.sql', '20b.sql', '20c.sql',\n '21a.sql', '21b.sql', '21c.sql',\n '22a.sql', '22b.sql', '22c.sql', '22d.sql',\n '23a.sql', '23b.sql', '23c.sql',\n '24a.sql', '24b.sql',\n '25a.sql', '25b.sql', '25c.sql',\n '26a.sql', '26b.sql', '26c.sql',\n '27a.sql', '27b.sql', '27c.sql',\n '28a.sql', '28b.sql', '28c.sql',\n '29a.sql', '29b.sql', '29c.sql',\n '30a.sql', '30b.sql', '30c.sql',\n '31a.sql', '31b.sql', '31c.sql',\n '32a.sql', '32b.sql',\n '33a.sql', '33b.sql', '33c.sql']\n\n print(all_file_list)\n files_list = []\n for file in all_file_list:\n files_list.append(res_path + file)\n return 
files_list\n\n\ndef custom_sort(item):\n # \u63d0\u53d6\u6570\u5b57\u548c\u5b57\u6bcd\u90e8\u5206\n match = re.match(r'(\\d+)(\\D+)', item)\n # \u5c06\u6570\u5b57\u90e8\u5206\u8f6c\u6362\u4e3a\u6574\u6570\u4ee5\u8fdb\u884c\u6bd4\u8f83\n num_part = int(match.group(1))\n # \u8fd4\u56de\u5143\u7ec4\u4ee5\u6309\u6570\u5b57\u548c\u5b57\u6bcd\u6392\u5e8f\n return (num_part, match.group(2))\n\n\ndef file_filter(f):\n if f[-4:] == '.sql' and 'schema' not in f and 'fkindexes' not in f:\n return True\n else:\n return False\n\n\ndef get_sql_from_file(file_name):\n file = open(file_name)\n lines = file.readlines().copy()\n sql = ''\n for line in lines:\n sql += line\n sql = sql.replace('\n', ' ').replace(' ', ' ').replace(' ', ' ')\n file.close()\n return sql\n\n\ndef test_hint_from_file(sql_file):\n db = Database()\n sql = get_sql_from_file(sql_file)\n success, result_cont = db.execute_sql(sql)\n print(success, result_cont)\n\n\ndef test_all():\n sql_files = all_sql_files()\n\n for sql_file in list(sql_files)[:-10]:\n if sql_file:\n test_hint_from_file(sql_file)\n\n\ndef test_one():\n res_path = \"{}/join-order-benchmark-master/\".format(\n os.path.dirname(os.path.abspath(__file__)))\n test_hint_from_file(res_path + '1a.sql')\n\n\nif __name__ == '__main__':\n for i in range(0, REPEATCOUNT):\n TIMELOG.write(str(int(time.time()))+\";\")\n test_all()\n TIMELOG.write(str(int(time.time()))+\"\n\")\n TIMELOG.flush()\n\n TIMELOG.close()\n\n)\n", + "description": "In a database used for analyzing customer data, there is a situation where the execution of join operations between multiple tables is not optimized, leading to poor performance. Additionally, there is high competition for CPU resources due to other processes running on the system, resulting in slower query execution.\n", + "desc": "In a Business Intelligence (BI) scenario, suppose there is a database used for recording and analyzing various business data of a large corporation. This database contains multiple complex tables, each filled with a large number of rows and columns, recording detailed business information. In this particular situation, the company's analysts often need to perform complex join queries across multiple tables to generate comprehensive business reports. However, due to the large size of these tables and the involvement of multi-table joins, the performance of these queries can become very slow. If the join keys in these tables are not effectively indexed, or if the queries are not properly optimized, executing these join operations can consume a significant amount of time and resources. During peak periods, when multiple complex join queries are executed simultaneously, there may be competition for CPU resources, leading to reduced query efficiency. 
This CPU contention may occur if there are too many compute-intensive queries running on the database server or if the server's CPU resources are insufficient to handle these queries.\n" + }, + "151": { + "start_time": "1697318812", + "end_time": "1697318959", + "start_timestamp": "2023-10-15 05:26:52", + "end_timestamp": "2023-10-15 05:29:19", + "alerts": [], + "labels": [ + "FETCH_LARGE_DATA", + "CORRELATED SUBQUERY" + ], + "command": "python anomaly_trigger/main.py --anomaly FETCH_LARGE_DATA,CORRELATED_SUBQUERY", + "script": "import os\nimport re\nimport time\n\nimport psycopg2\n\n\nREPEATCOUNT = 1\nTIMELOGPATH = str(int(time.time())) + \"_tpch_trigger_time_log.txt\"\nTIMELOG = open(TIMELOGPATH, 'w+')\n\n\nclass Database():\n\n def __init__(self):\n self.conn = None\n self.conn = psycopg2.connect(database='tpch',\n user='xxxx',\n password='xxxx',\n host='xxxx',\n port=5432)\n\n def execute_sql(self, sql):\n fail = 1\n cur = self.conn.cursor()\n i = 0\n cnt = 3\n while fail == 1 and i < cnt:\n try:\n fail = 0\n cur.execute(sql)\n except BaseException as error:\n fail = 1\n print(error)\n res = []\n if fail == 0:\n res = cur.fetchall()\n i = i + 1\n if fail == 1:\n # print(\"SQL Execution Fatal!!\", sql)\n return 0, ''\n elif fail == 0:\n return 1, res\n\n\ndef all_sql_files():\n res_path = \"{}/tpch-queries/\".format(\n os.path.dirname(os.path.abspath(__file__)))\n # all_file_list = list(filter(file_filter, os.listdir(res_path)))\n # all_file_list = sorted(all_file_list, key=custom_sort)\n all_file_list = [\n '4.explain.sql']\n\n print(all_file_list)\n files_list = []\n for file in all_file_list:\n files_list.append(res_path + file)\n return files_list\n\n\ndef custom_sort(item):\n # \u63d0\u53d6\u6570\u5b57\u548c\u5b57\u6bcd\u90e8\u5206\n match = re.match(r'(\\d+)(\\D+)', item)\n # \u5c06\u6570\u5b57\u90e8\u5206\u8f6c\u6362\u4e3a\u6574\u6570\u4ee5\u8fdb\u884c\u6bd4\u8f83\n num_part = int(match.group(1))\n # \u8fd4\u56de\u5143\u7ec4\u4ee5\u6309\u6570\u5b57\u548c\u5b57\u6bcd\u6392\u5e8f\n return (num_part, match.group(2))\n\n\ndef file_filter(f):\n if f[-4:] == '.sql' and 'schema' not in f and 'fkindexes' not in f:\n return True\n else:\n return False\n\n\ndef get_sql_from_file(file_name):\n file = open(file_name)\n lines = file.readlines().copy()\n sql = ''\n for line in lines:\n sql += line\n sql = sql.replace('\n', ' ').replace(' ', ' ').replace(' ', ' ')\n file.close()\n return sql\n\n\ndef test_hint_from_file(sql_file):\n db = Database()\n sql = get_sql_from_file(sql_file)\n success, result_cont = db.execute_sql(sql)\n print(success, result_cont)\n\n\ndef test_all():\n sql_files = all_sql_files()\n\n for sql_file in list(sql_files):\n if sql_file:\n test_hint_from_file(sql_file)\n\n\ndef test_one():\n res_path = \"{}/tpch-queries/\".format(\n os.path.dirname(os.path.abspath(__file__)))\n test_hint_from_file(res_path + '1.explain.sql')\n\n\nif __name__ == '__main__':\n for i in range(0, REPEATCOUNT):\n TIMELOG.write(str(int(time.time()))+\";\")\n test_all()\n TIMELOG.write(str(int(time.time()))+\"\n\")\n TIMELOG.flush()\n\n TIMELOG.close()\n", + "description": "In an online marketplace, when trying to retrieve a large amount of data such as inventory information for each product, the query involves correlated subqueries. 
If these subqueries are not optimized properly, the performance of the inventory retrieval process may be negatively affected.\n", + "desc": "In the database of an e-commerce platform, there is a database named 'ECommerceDB' that stores information about various products. One important table in this database is called 'ProductInventory', which contains inventory data for tens of thousands or even hundreds of thousands of products. This includes information such as the product ID, current stock level, last inventory update time, supplier ID, and warehouse location. In this database, there are queries that involve related subqueries to retrieve inventory information for specific products or categories. One common query is to determine the total current inventory of all products within a specific category. To perform this query, the database needs to select all products of the desired category from the 'ProductInventory' table and then conduct subqueries to retrieve the inventory data for these products. However, when the number of products is large, these subqueries can become inefficient and result in slower query performance. This is because the database needs to read a significant amount of data from the disk, leading to potential I/O bottlenecks.\n" + }, + "152": { + "start_time": "1697319019", + "end_time": "1697319091", + "start_timestamp": "2023-10-15 05:30:19", + "end_timestamp": "2023-10-15 05:31:31", + "alerts": [], + "labels": [ + "highly concurrent commits or highly concurrent inserts" + ], + "command": "python anomaly_trigger/main.py --anomaly INSERT_LARGE_DATA", + "script": "import psycopg2\nimport sys\nsys.path.append('/root/DB-GPT/')\nimport time\nimport datetime\nimport random\nimport yaml\nfrom multiprocessing.pool import *\n\nclass DBArgs(object):\n\n def __init__(self, dbtype, config, dbname=None):\n self.dbtype = dbtype\n if self.dbtype == 'mysql':\n self.host = config['host']\n self.port = config['port']\n self.user = config['user']\n self.password = config['password']\n self.dbname = dbname if dbname else config['dbname']\n self.driver = 'com.mysql.jdbc.Driver'\n self.jdbc = 'jdbc:mysql://'\n else:\n self.host = config['host']\n self.port = config['port']\n self.user = config['user']\n self.password = config['password']\n self.dbname = dbname if dbname else config['dbname']\n self.driver = 'org.postgresql.Driver'\n self.jdbc = 'jdbc:postgresql://'\n\nclass Database():\n def __init__(self, args, timeout=-1):\n self.args = args\n self.conn = self.resetConn(timeout)\n\n\n # self.schema = self.compute_table_schema()\n\n def resetConn(self, timeout=-1):\n conn = psycopg2.connect(database=self.args.dbname,\n user=self.args.user,\n password=self.args.password,\n host=self.args.host,\n port=self.args.port)\n return conn\n\n def execute_sqls(self,sql):\n self.conn =self.resetConn(timeout=-1)\n cur = self.conn.cursor()\n cur.execute(sql)\n self.conn.commit()\n cur.close()\n self.conn.close()\n\n def execute_sql_duration(self, duration, sql, max_id=0, commit_interval=500):\n self.conn = self.resetConn(timeout=-1)\n cursor = self.conn.cursor()\n start = time.time()\n cnt = 0\n if duration > 0:\n while (time.time() - start) < duration:\n if max_id > 0:\n id = random.randint(1, max_id - 1)\n cursor.execute(sql + str(id) + ';')\n else:\n cursor.execute(sql)\n cnt += 1\n if cnt % commit_interval == 0:\n self.conn.commit()\n else:\n print(\"error, the duration should be larger than 0\")\n self.conn.commit()\n cursor.close()\n self.conn.close()\n return cnt\n\n def 
concurrent_execute_sql(self, threads, duration, sql, max_id=0, commit_interval=500):\n pool = ThreadPool(threads)\n results = [pool.apply_async(self.execute_sql_duration, (duration, sql, max_id, commit_interval)) for _ in range(threads)]\n pool.close()\n pool.join()\n return results\n \ndef init():\n #add the config\n config_path = \"/root/DB-GPT/config/tool_config.yaml\"\n with open(config_path, 'r') as config_file:\n config = yaml.safe_load(config_file) \n db_args =DBArgs('pgsql', config)\n return db_args\n\n#create a table\ndef create_table(table_name,colsize, ncolumns):\n db=Database(init())\n column_definitions = ', '.join(f'name{i} varchar({colsize})' for i in range(ncolumns))\n creat_sql = f'CREATE TABLE {table_name} (id int, {column_definitions}, time timestamp);'\n db.execute_sqls(creat_sql)\n\n#delete the table\ndef delete_table(table_name):\n db=Database(init())\n delete_sql=f'DROP TABLE if exists {table_name}'\n db.execute_sqls(delete_sql)\n\n#print the current time\ndef print_time():\n current_time = datetime.datetime.now()\n formatted_time = current_time.strftime(\"%Y-%m-%d %H:%M:%S\")\n print(formatted_time)\n\n\ndef insert_large_data(threads,duration,ncolumns,nrows,colsize,table_name='table1'):\n\n print_time()\n #Delete undeleted tables\n delete_table(table_name)\n #create a new table\n create_table(table_name,colsize, ncolumns)\n db=Database(init())\n #insert the data\n #insert_definitions = ', '.join(f'repeat(round(random()*999)::text,{(colsize//3)})' for i in range(ncolumns))\n insert_definitions = ', '.join(f'(SELECT substr(md5(random()::text), 1, {colsize}))' for i in range(ncolumns))\n insert_data=f'insert into {table_name} select generate_series(1,{nrows}),{insert_definitions}, now();'\n db.concurrent_execute_sql(threads,duration,insert_data,commit_interval=1)\n\n #delete the table\n delete_table(table_name)\n \n #print the end time\n print_time()\nif __name__ == \"__main__\":\n # Number of threads to use for concurrent inserts\n num_threads = 161\n \n # Duration for which to run the inserts (in seconds)\n insert_duration = None\n \n # Number of columns in the table\n num_columns = 9\n \n # Number of rows to insert\n num_rows = 85\n \n # Size of each column (in characters)\n column_size = 72\n \n # Table name\n table_name = 'table1'\n \n # Call the insert_large_data function\n insert_large_data(num_threads, insert_duration, num_columns, num_rows, column_size, table_name)\n", + "description": "In an IoT application, a large amount of data generated by 161 sensors needs to be inserted into the database simultaneously. The database contains a table with 9 columns and 85 rows, with each column having a size of 72 characters. This process simulates the database exception caused by the high volume of data insertion.\n", + "desc": "In an Internet of Things (IoT) scenario, there is a database used for storing and analyzing sensor data, called 'SensorDataDB'. This database is designed to handle a large volume of data from various types of sensors. One of the key tables in this database is named 'SensorReadings', which stores information about the readings from these sensors. Each row in this table represents a specific sensor reading, and there are a total of 85 rows of data. The table has 9 columns, each containing information of up to 72 characters. These columns include sensor ID, reading type, reading value, timestamp, sensor location, and status information. In a specific situation, 161 sensors start transmitting data simultaneously at a very high frequency. 
This influx of data can lead to performance issues in the database. Due to factors such as lack of efficient data partitioning, insufficient buffering mechanisms, or improper indexing, the database may struggle to process these numerous concurrent write requests efficiently. This can result in increased write latency, which can ultimately lead to anomalies in the database. These anomalies can affect the overall functionality and performance of the IoT system.\n" + }, + "153": { + "start_time": "1697319151", + "end_time": "1697319223", + "start_timestamp": "2023-10-15 05:32:31", + "end_timestamp": "2023-10-15 05:33:43", + "alerts": [], + "labels": [ + "highly concurrent commits or highly concurrent inserts" + ], + "command": "python anomaly_trigger/main.py --anomaly INSERT_LARGE_DATA", + "script": "import psycopg2\nimport sys\nsys.path.append('/root/DB-GPT/')\nimport time\nimport datetime\nimport random\nimport yaml\nfrom multiprocessing.pool import *\n\nclass DBArgs(object):\n\n def __init__(self, dbtype, config, dbname=None):\n self.dbtype = dbtype\n if self.dbtype == 'mysql':\n self.host = config['host']\n self.port = config['port']\n self.user = config['user']\n self.password = config['password']\n self.dbname = dbname if dbname else config['dbname']\n self.driver = 'com.mysql.jdbc.Driver'\n self.jdbc = 'jdbc:mysql://'\n else:\n self.host = config['host']\n self.port = config['port']\n self.user = config['user']\n self.password = config['password']\n self.dbname = dbname if dbname else config['dbname']\n self.driver = 'org.postgresql.Driver'\n self.jdbc = 'jdbc:postgresql://'\n\nclass Database():\n def __init__(self, args, timeout=-1):\n self.args = args\n self.conn = self.resetConn(timeout)\n\n\n # self.schema = self.compute_table_schema()\n\n def resetConn(self, timeout=-1):\n conn = psycopg2.connect(database=self.args.dbname,\n user=self.args.user,\n password=self.args.password,\n host=self.args.host,\n port=self.args.port)\n return conn\n\n def execute_sqls(self,sql):\n self.conn =self.resetConn(timeout=-1)\n cur = self.conn.cursor()\n cur.execute(sql)\n self.conn.commit()\n cur.close()\n self.conn.close()\n\n def execute_sql_duration(self, duration, sql, max_id=0, commit_interval=500):\n self.conn = self.resetConn(timeout=-1)\n cursor = self.conn.cursor()\n start = time.time()\n cnt = 0\n if duration > 0:\n while (time.time() - start) < duration:\n if max_id > 0:\n id = random.randint(1, max_id - 1)\n cursor.execute(sql + str(id) + ';')\n else:\n cursor.execute(sql)\n cnt += 1\n if cnt % commit_interval == 0:\n self.conn.commit()\n else:\n print(\"error, the duration should be larger than 0\")\n self.conn.commit()\n cursor.close()\n self.conn.close()\n return cnt\n\n def concurrent_execute_sql(self, threads, duration, sql, max_id=0, commit_interval=500):\n pool = ThreadPool(threads)\n results = [pool.apply_async(self.execute_sql_duration, (duration, sql, max_id, commit_interval)) for _ in range(threads)]\n pool.close()\n pool.join()\n return results\n \ndef init():\n #add the config\n config_path = \"/root/DB-GPT/config/tool_config.yaml\"\n with open(config_path, 'r') as config_file:\n config = yaml.safe_load(config_file) \n db_args =DBArgs('pgsql', config)\n return db_args\n\n#create a table\ndef create_table(table_name,colsize, ncolumns):\n db=Database(init())\n column_definitions = ', '.join(f'name{i} varchar({colsize})' for i in range(ncolumns))\n creat_sql = f'CREATE TABLE {table_name} (id int, {column_definitions}, time timestamp);'\n db.execute_sqls(creat_sql)\n\n#delete the 
table\ndef delete_table(table_name):\n db=Database(init())\n delete_sql=f'DROP TABLE if exists {table_name}'\n db.execute_sqls(delete_sql)\n\n#print the current time\ndef print_time():\n current_time = datetime.datetime.now()\n formatted_time = current_time.strftime(\"%Y-%m-%d %H:%M:%S\")\n print(formatted_time)\n\n\ndef insert_large_data(threads,duration,ncolumns,nrows,colsize,table_name='table1'):\n\n print_time()\n #Delete undeleted tables\n delete_table(table_name)\n #create a new table\n create_table(table_name,colsize, ncolumns)\n db=Database(init())\n #insert the data\n #insert_definitions = ', '.join(f'repeat(round(random()*999)::text,{(colsize//3)})' for i in range(ncolumns))\n insert_definitions = ', '.join(f'(SELECT substr(md5(random()::text), 1, {colsize}))' for i in range(ncolumns))\n insert_data=f'insert into {table_name} select generate_series(1,{nrows}),{insert_definitions}, now();'\n db.concurrent_execute_sql(threads,duration,insert_data,commit_interval=1)\n\n #delete the table\n delete_table(table_name)\n \n #print the end time\n print_time()\nif __name__ == \"__main__\":\n # Number of threads to use for concurrent inserts\n num_threads = 161\n \n # Duration for which to run the inserts (in seconds)\n insert_duration = None\n \n # Number of columns in the table\n num_columns = 28\n \n # Number of rows to insert\n num_rows = 94\n \n # Size of each column (in characters)\n column_size = 95\n \n # Table name\n table_name = 'table1'\n \n # Call the insert_large_data function\n insert_large_data(num_threads, insert_duration, num_columns, num_rows, column_size, table_name)\n", + "description": "In a data-intensive application, 161 data sources are simultaneously inserting a large amount of data with 28 columns, where each column can store up to 95 characters, into the database. This process may lead to database exceptions due to the high volume and concurrency of data insertion.\n", + "desc": "In the context of an Internet of Things (IoT) scenario, let's consider a database used for sensor data analysis and storage called 'SensorDataDB'. This database is specifically designed to handle a large amount of data from various sensors. In this scenario, the anomaly being triggered is related to the insertion of large data. The script executes commands to insert a significant amount of data into the database. The specified parameters indicate that the script will insert data using 161 threads, with each row containing 28 columns, and each column having a size of 95 characters. Additionally, a total of 94 rows will be inserted into the database. 
Such a large-scale insertion of data can potentially cause performance issues in the database, especially in terms of processing speed and resource utilization.\n" + }, + "154": { + "start_time": "1697319283", + "end_time": "1697319343", + "start_timestamp": "2023-10-15 05:34:43", + "end_timestamp": "2023-10-15 05:35:43", + "alerts": [], + "labels": [ + "highly concurrent updates" + ], + "command": "python anomaly_trigger/main.py --anomaly LOCK_CONTENTION", + "script": "import psycopg2\nimport sys\nsys.path.append('/root/DB-GPT/')\nimport time\nimport datetime\nimport random\nimport yaml\nfrom multiprocessing.pool import *\n\n\nclass DBArgs(object):\n\n def __init__(self, dbtype, config, dbname=None):\n self.dbtype = dbtype\n if self.dbtype == 'mysql':\n self.host = config['host']\n self.port = config['port']\n self.user = config['user']\n self.password = config['password']\n self.dbname = dbname if dbname else config['dbname']\n self.driver = 'com.mysql.jdbc.Driver'\n self.jdbc = 'jdbc:mysql://'\n else:\n self.host = config['host']\n self.port = config['port']\n self.user = config['user']\n self.password = config['password']\n self.dbname = dbname if dbname else config['dbname']\n self.driver = 'org.postgresql.Driver'\n self.jdbc = 'jdbc:postgresql://'\n\nclass Database():\n def __init__(self, args, timeout=-1):\n self.args = args\n self.conn = self.resetConn(timeout)\n\n\n # self.schema = self.compute_table_schema()\n\n def resetConn(self, timeout=-1):\n conn = psycopg2.connect(database=self.args.dbname,\n user=self.args.user,\n password=self.args.password,\n host=self.args.host,\n port=self.args.port)\n return conn\n \n def execute_sqls(self,sql):\n self.conn =self.resetConn(timeout=-1)\n cur = self.conn.cursor()\n cur.execute(sql)\n self.conn.commit()\n cur.close()\n self.conn.close()\n\n def execute_sql_duration(self, duration, sql, max_id=0, commit_interval=500):\n self.conn = self.resetConn(timeout=-1)\n cursor = self.conn.cursor()\n start = time.time()\n cnt = 0\n if duration > 0:\n while (time.time() - start) < duration:\n if max_id > 0:\n id = random.randint(1, max_id - 1)\n cursor.execute(sql + str(id) + ';')\n else:\n cursor.execute(sql)\n cnt += 1\n if cnt % commit_interval == 0:\n self.conn.commit()\n else:\n print(\"error, the duration should be larger than 0\")\n self.conn.commit()\n cursor.close()\n self.conn.close()\n return cnt\n\n def concurrent_execute_sql(self, threads, duration, sql, max_id=0, commit_interval=500):\n pool = ThreadPool(threads)\n results = [pool.apply_async(self.execute_sql_duration, (duration, sql, max_id, commit_interval)) for _ in range(threads)]\n pool.close()\n pool.join()\n return results\n\ndef init():\n #add the config\n config_path = \"/root/DB-GPT/config/tool_config.yaml\"\n with open(config_path, 'r') as config_file:\n config = yaml.safe_load(config_file) \n db_args =DBArgs('pgsql', config)\n return db_args\n\n\n#create a table\ndef create_table(table_name,colsize, ncolumns):\n db=Database(init())\n column_definitions = ', '.join(f'name{i} varchar({colsize})' for i in range(ncolumns))\n creat_sql = f'CREATE TABLE {table_name} (id int, {column_definitions}, time timestamp);'\n db.execute_sqls(creat_sql)\n\n#delete the table\ndef delete_table(table_name):\n db=Database(init())\n delete_sql=f'DROP TABLE if exists {table_name}'\n db.execute_sqls(delete_sql)\n\n#print the current time\ndef print_time():\n current_time = datetime.datetime.now()\n formatted_time = current_time.strftime(\"%Y-%m-%d %H:%M:%S\")\n print(formatted_time)\n\n\ndef 
lock_contention(threads,duration,ncolumns,nrows,colsize,table_name='table1'):\n #create a new table\n print_time()\n delete_table(table_name)\n create_table(table_name,colsize, ncolumns)\n db=Database(init())\n # insert some data to be updated \n insert_definitions = ', '.join(f'(SELECT substr(md5(random()::text), 1, {colsize}))' for i in range(ncolumns))\n insert_data=f'insert into {table_name} select generate_series(1,{nrows}),{insert_definitions}, now();' \n db.execute_sqls(insert_data) \n pool = Pool(threads)\n for _ in range(threads):\n pool.apply_async(\n lock, (table_name, ncolumns, colsize, duration, nrows))\n pool.close()\n pool.join()\n #delete the table\n delete_table(table_name)\n print_time()\n\ndef lock(table_name, ncolumns, colsize, duration, nrows):\n args=init()\n start = time.time()\n #lock_contention\n while time.time()-start < duration:\n conn = psycopg2.connect(database=args.dbname, user=args.user, password=args.password,\n host=args.host, port=args.port)\n cur = conn.cursor()\n while time.time()-start < duration:\n col_name = random.randint(0, ncolumns-1)\n row_name = random.randint(1, nrows-1)\n lock_contention = f'update {table_name} set name{col_name}=(SELECT substr(md5(random()::text), 1, {colsize})) where id ={row_name}'\n #db.concurrent_execute_sql(threads,duration,lock_contention,nrows)\n cur.execute(lock_contention)\n conn.commit()\n conn.commit()\n conn.close()\n\nif __name__ == \"__main__\":\n # Number of threads to use for concurrent inserts\n num_threads = 183\n \n # Duration for which to run the inserts (in seconds)\n insert_duration = None\n \n # Number of columns in the table\n num_columns = 72\n \n # Number of rows to insert\n num_rows = 359\n \n # Size of each column (in characters)\n column_size = 84\n \n # Table name\n table_name = 'table1'\n \n # Call the insert_large_data function\n lock_contention(num_threads, insert_duration, num_columns, num_rows, column_size, table_name)\n", + "description": "In an online gaming platform's database, 183 players are simultaneously trying to update information in a table containing 72 columns and 359 rows of player records, with each column having a size of 84 characters. These players are competing with each other to lock the table for updates, leading to a database exception.\n", + "desc": "In an Internet of Things (IoT) scenario, there is a database used for collecting and analyzing sensor data called 'SensorDataDB'. This database is designed to handle a large volume of data from various types of sensors. The main table in the database is called 'SensorReadings', which contains information about sensor readings such as temperature, humidity, pressure, light, motion, etc. The table has 359 rows, each representing a reading from a sensor, with 72 columns containing information of up to 84 characters. These columns include sensor ID, reading type, reading value, timestamp, sensor location, and status information. With a high number of concurrent sensor readings, the database might experience locking contention. This means that when multiple users or sensors try to update the same or adjacent rows in the table simultaneously, there might be a competition for locking the database, leading to performance issues and delayed processing of other users' or sensors' requests. To simulate this scenario, the script is triggering the anomaly 'LOCK_CONTENTION' with 183 threads, representing a high number of concurrent update operations. 
This helps to evaluate the impact of locking contention on the performance of the 'SensorDataDB' database in an IoT environment.\n" + }, + "155": { + "start_time": "1697319403", + "end_time": "1697319469", + "start_timestamp": "2023-10-15 05:36:43", + "end_timestamp": "2023-10-15 05:37:49", + "alerts": [], + "labels": [ + "highly deletes" + ], + "command": "python anomaly_trigger/main.py --anomaly VACUUM", + "script": "import psycopg2\nimport sys\nsys.path.append('/root/DB-GPT/')\nimport time\nimport datetime\nimport random\nimport yaml\nfrom multiprocessing.pool import *\n\n\nclass DBArgs(object):\n\n def __init__(self, dbtype, config, dbname=None):\n self.dbtype = dbtype\n if self.dbtype == 'mysql':\n self.host = config['host']\n self.port = config['port']\n self.user = config['user']\n self.password = config['password']\n self.dbname = dbname if dbname else config['dbname']\n self.driver = 'com.mysql.jdbc.Driver'\n self.jdbc = 'jdbc:mysql://'\n else:\n self.host = config['host']\n self.port = config['port']\n self.user = config['user']\n self.password = config['password']\n self.dbname = dbname if dbname else config['dbname']\n self.driver = 'org.postgresql.Driver'\n self.jdbc = 'jdbc:postgresql://'\n\nclass Database():\n def __init__(self, args, timeout=-1):\n self.args = args\n self.conn = self.resetConn(timeout)\n\n\n # self.schema = self.compute_table_schema()\n\n def resetConn(self, timeout=-1):\n conn = psycopg2.connect(database=self.args.dbname,\n user=self.args.user,\n password=self.args.password,\n host=self.args.host,\n port=self.args.port)\n return conn\n \n def execute_sqls(self,sql):\n self.conn =self.resetConn(timeout=-1)\n cur = self.conn.cursor()\n cur.execute(sql)\n self.conn.commit()\n cur.close()\n self.conn.close()\n\n def execute_sql_duration(self, duration, sql, max_id=0, commit_interval=500):\n self.conn = self.resetConn(timeout=-1)\n cursor = self.conn.cursor()\n start = time.time()\n cnt = 0\n if duration > 0:\n while (time.time() - start) < duration:\n if max_id > 0:\n id = random.randint(1, max_id - 1)\n cursor.execute(sql + str(id) + ';')\n else:\n cursor.execute(sql)\n cnt += 1\n if cnt % commit_interval == 0:\n self.conn.commit()\n else:\n print(\"error, the duration should be larger than 0\")\n self.conn.commit()\n cursor.close()\n self.conn.close()\n return cnt\n\n def concurrent_execute_sql(self, threads, duration, sql, max_id=0, commit_interval=500):\n pool = ThreadPool(threads)\n results = [pool.apply_async(self.execute_sql_duration, (duration, sql, max_id, commit_interval)) for _ in range(threads)]\n pool.close()\n pool.join()\n return results\n \ndef init():\n #add the config\n config_path = \"/root/DB-GPT/config/tool_config.yaml\"\n with open(config_path, 'r') as config_file:\n config = yaml.safe_load(config_file) \n db_args =DBArgs('pgsql', config)\n return db_args\n\n\n#create a table\ndef create_table(table_name,colsize, ncolumns):\n db=Database(init())\n column_definitions = ', '.join(f'name{i} varchar({colsize})' for i in range(ncolumns))\n creat_sql = f'CREATE TABLE {table_name} (id int, {column_definitions}, time timestamp);'\n db.execute_sqls(creat_sql)\n\n#delete the table\ndef delete_table(table_name):\n db=Database(init())\n delete_sql=f'DROP TABLE if exists {table_name}'\n db.execute_sqls(delete_sql)\n\n#print the current time\ndef print_time():\n current_time = datetime.datetime.now()\n formatted_time = current_time.strftime(\"%Y-%m-%d %H:%M:%S\")\n print(formatted_time)\n\n\ndef 
vacuum(threads,duration,ncolumns,nrows,colsize,table_name='table1'):\n db=Database(init())\n #create a new table\n print_time()\n delete_table(table_name)\n create_table(table_name,colsize, ncolumns)\n\n # insert some data to be deleted\n insert_definitions = ', '.join(f'(SELECT substr(md5(random()::text), 1, {colsize}))' for i in range(ncolumns))\n insert_data=f'insert into {table_name} select generate_series(1,{nrows}),{insert_definitions}, now();' \n db.execute_sqls(insert_data) \n\n # delete 80% of the rows\n delete_nrows=int(nrows*0.8)\n vacuum=f'delete from {table_name} where id < {delete_nrows};'\n db.execute_sqls(vacuum)\n\n # do the select , then the vacuum occurs\n select='select * from '+table_name+' where id='\n db.concurrent_execute_sql(threads,duration,select,nrows)\n\n #delete the table\n delete_table(table_name)\n print_time()\n\n\nif __name__ == \"__main__\":\n # Number of threads to use for concurrent inserts\n num_threads = 188\n \n # Duration for which to run the inserts (in seconds)\n insert_duration = None\n \n # Number of columns in the table\n num_columns = 9\n \n # Number of rows to insert\n num_rows = 2343320\n \n # Size of each column (in characters)\n column_size = 95\n \n # Table name\n table_name = 'table1'\n \n # Call the insert_large_data function\n vacuum(num_threads, insert_duration, num_columns, num_rows, column_size, table_name)\n", + "description": "In the database of an online store, there are 188 users searching in the database table containing 9 columns, 2,343,320 rows. Each column has a size of 95 characters. The search operation is performed after a large-scale data cleaning operation.\n", + "desc": "In the IoT scenario, there is a database specifically used for collecting and analyzing sensor data. This database contains a table named 'SensorDataDB' which stores data from various types of sensors. The table consists of 2,343,320 rows of data, with each row representing a single sensor reading. There are 9 columns in the table, each with a size of 95 characters. These columns include sensor ID, reading type, reading value, timestamp, sensor location, and status information. During regular operation, the sensor data is constantly being updated and new readings are being added. Over time, the database may accumulate a large amount of outdated or unnecessary data. To improve the performance and efficiency of the database, the administrator needs to periodically perform a data cleanup operation called 'VACUUM'. This operation involves identifying and removing obsolete or unused data from the table, freeing up storage space and optimizing the database for future use. In this specific case, the 'VACUUM' operation is triggered with the settings of 188 threads, meaning that multiple concurrent threads will be used to perform the cleanup operation. This can speed up the process and reduce the time required for the cleanup. Additionally, the table has 9 columns, each with a size of 95 characters, and a total of 2,343,320 rows of data. 
This indicates a significant amount of data that needs to be processed during the cleanup operation.By performing the 'VACUUM' operation with these settings, the database administrator can ensure the efficient storage and management of the sensor data in the 'SensorDataDB' table, preventing any potential anomalies or performance issues in the IoT system.\n" + }, + "156": { + "start_time": "1697319530", + "end_time": "1697319644", + "start_timestamp": "2023-10-15 05:38:50", + "end_timestamp": "2023-10-15 05:40:44", + "alerts": [], + "labels": [ + "too many indexes" + ], + "command": "python anomaly_trigger/main.py --anomaly REDUNDANT_INDEX", + "script": "import psycopg2\nimport sys\nsys.path.append('/root/DB-GPT/')\nimport time\nimport datetime\nimport random\nimport yaml\nfrom multiprocessing.pool import *\n\n\nclass DBArgs(object):\n\n def __init__(self, dbtype, config, dbname=None):\n self.dbtype = dbtype\n if self.dbtype == 'mysql':\n self.host = config['host']\n self.port = config['port']\n self.user = config['user']\n self.password = config['password']\n self.dbname = dbname if dbname else config['dbname']\n self.driver = 'com.mysql.jdbc.Driver'\n self.jdbc = 'jdbc:mysql://'\n else:\n self.host = config['host']\n self.port = config['port']\n self.user = config['user']\n self.password = config['password']\n self.dbname = dbname if dbname else config['dbname']\n self.driver = 'org.postgresql.Driver'\n self.jdbc = 'jdbc:postgresql://'\n\nclass Database():\n def __init__(self, args, timeout=-1):\n self.args = args\n self.conn = self.resetConn(timeout)\n\n\n # self.schema = self.compute_table_schema()\n\n def resetConn(self, timeout=-1):\n conn = psycopg2.connect(database=self.args.dbname,\n user=self.args.user,\n password=self.args.password,\n host=self.args.host,\n port=self.args.port)\n return conn\n \n def execute_sqls(self,sql):\n self.conn =self.resetConn(timeout=-1)\n cur = self.conn.cursor()\n cur.execute(sql)\n self.conn.commit()\n cur.close()\n self.conn.close()\n\n def execute_sql_duration(self, duration, sql, max_id=0, commit_interval=500):\n self.conn = self.resetConn(timeout=-1)\n cursor = self.conn.cursor()\n start = time.time()\n cnt = 0\n if duration > 0:\n while (time.time() - start) < duration:\n if max_id > 0:\n id = random.randint(1, max_id - 1)\n cursor.execute(sql + str(id) + ';')\n else:\n cursor.execute(sql)\n cnt += 1\n if cnt % commit_interval == 0:\n self.conn.commit()\n else:\n print(\"error, the duration should be larger than 0\")\n self.conn.commit()\n cursor.close()\n self.conn.close()\n return cnt\n\n def concurrent_execute_sql(self, threads, duration, sql, max_id=0, commit_interval=500):\n pool = ThreadPool(threads)\n results = [pool.apply_async(self.execute_sql_duration, (duration, sql, max_id, commit_interval)) for _ in range(threads)]\n pool.close()\n pool.join()\n return results\n \n def build_index(self, table_name, idx_num):\n self.conn = self.resetConn(timeout=-1)\n cursor = self.conn.cursor()\n \n for i in range(0, idx_num):\n the_sql = 'CREATE INDEX index_' + table_name + '_' + str(i) + ' ON ' + table_name + '(name' + str(i) + ');'\n print(the_sql)\n cursor.execute(the_sql)\n\n \n self.conn.commit()\n self.conn.close()\n return\n\n\n \n def drop_index(self,table_name):\n self.conn = self.resetConn(timeout=-1)\n cursor = self.conn.cursor()\n cursor.execute(\"select indexname from pg_indexes where tablename='\"+table_name+\"';\")\n idxs = cursor.fetchall()\n for idx in idxs:\n the_sql = 'DROP INDEX ' + idx[0] + ';'\n cursor.execute(the_sql)\n 
print(the_sql)\n self.conn.commit()\n self.conn.close()\n return\n\n\ndef init():\n #add the config\n config_path = \"/root/DB-GPT/config/tool_config.yaml\"\n with open(config_path, 'r') as config_file:\n config = yaml.safe_load(config_file) \n db_args =DBArgs('pgsql', config)\n return db_args\n\n\n#create a table\ndef create_table(table_name,colsize, ncolumns):\n db=Database(init())\n column_definitions = ', '.join(f'name{i} varchar({colsize})' for i in range(ncolumns))\n creat_sql = f'CREATE TABLE {table_name} (id int, {column_definitions}, time timestamp);'\n db.execute_sqls(creat_sql)\n\n#delete the table\ndef delete_table(table_name):\n db=Database(init())\n delete_sql=f'DROP TABLE if exists {table_name}'\n db.execute_sqls(delete_sql)\n\n#print the current time\ndef print_time():\n current_time = datetime.datetime.now()\n formatted_time = current_time.strftime(\"%Y-%m-%d %H:%M:%S\")\n print(formatted_time)\n\ndef redundent_index(threads,duration,ncolumns,nrows,colsize,nindex,table_name='table1'):\n #create a new table\n print_time()\n delete_table(table_name)\n create_table(table_name,colsize, ncolumns)\n db=Database(init())\n # insert some data to be updated \n insert_definitions = ', '.join(f'(SELECT substr(md5(random()::text), 1, {colsize}))' for i in range(ncolumns))\n insert_data=f'insert into {table_name} select generate_series(1,{nrows}),{insert_definitions}, now();' \n db.execute_sqls(insert_data) \n\n #initialization of the indexes\n nindex=int((nindex*ncolumns)/10)\n db.build_index(table_name,nindex)\n id_index='CREATE INDEX index_'+table_name+'_id ON '+table_name+'(id);'\n db.execute_sqls(id_index)\n\n #lock_contention\n pool = Pool(threads)\n for _ in range(threads):\n pool.apply_async(\n lock, (table_name, ncolumns, colsize, duration, nrows))\n pool.close()\n pool.join()\n\n #drop the index\n db.drop_index(table_name)\n\n #delete the table\n delete_table(table_name)\n print_time()\n\n\ndef lock(table_name, ncolumns, colsize, duration, nrows):\n args=init()\n start = time.time()\n #lock_contention\n while time.time()-start < duration:\n conn = psycopg2.connect(database=args.dbname, user=args.user, password=args.password,\n host=args.host, port=args.port)\n cur = conn.cursor()\n while time.time()-start < duration:\n col_name = random.randint(0, ncolumns-1)\n row_name = random.randint(1, nrows-1)\n lock_contention = f'update {table_name} set name{col_name}=(SELECT substr(md5(random()::text), 1, {colsize})) where id ={row_name}'\n #db.concurrent_execute_sql(threads,duration,lock_contention,nrows)\n cur.execute(lock_contention)\n conn.commit()\n conn.commit()\n conn.close()\n\nif __name__ == \"__main__\":\n # Number of threads to use for concurrent inserts\n num_threads = 9\n \n # Duration for which to run the inserts (in seconds)\n insert_duration = None\n \n # Number of columns in the table\n num_columns = 91\n \n # Number of rows to insert\n num_rows = 928167\n \n # Size of each column (in characters)\n column_size = 94\n \n # Table name\n table_name = 'table1'\n \n nindex=6\n \n # Call the insert_large_data function\n redundent_index(num_threads, insert_duration, num_columns, num_rows, column_size, nindex,table_name)\n", + "description": "In an online marketplace, the database is configured with redundant indexes for various attributes such as product name, category, and price range. 
However, when 9 users simultaneously perform queries on a database table with 91 columns, 928,167 rows, and column sizes of 94 characters, the performance of the database may be negatively impacted due to the extra storage footprint and overhead caused by the redundant indexes.\n", + "desc": "In an Internet of Things (IoT) scenario, there is a database named 'IoTDataDB' that is used for storing and processing sensor data. This database contains a key table named 'SensorData', which records data from various sensors. The table consists of 928,167 rows of data, each representing a sensor reading, with a total of 91 columns. These columns store information such as sensor ID, data type (e.g. temperature, humidity, pressure), data value, timestamp, location, and status.In this scenario, there might be a need to perform index optimization for querying sensor data. For example, when analyzing temperature trends over time, or comparing humidity levels across different locations, creating indexes on specific columns can help improve query performance. However, if redundant indexes are created, meaning multiple indexes on the same columns or sets of columns, it can lead to unnecessary overhead in terms of storage and query execution. Redundant indexes can consume additional storage space and slow down query performance due to the extra time required for index maintenance.When running the provided command with the 'REDUNDANT_INDEX' anomaly, the script aims to simulate the impact of redundant index creation in this IoT data scenario. By specifying the number of threads (9), the number of columns (91), the column size (94 characters), and the number of rows (928,167), the script will generate a workload that triggers redundant index issues. This workload might involve concurrent queries that experience performance slowdowns, potential storage overhead, and increased query execution time due to the presence of redundant indexes in the database.\n" + }, + "157": { + "start_time": "1697319704", + "end_time": "1697319795", + "start_timestamp": "2023-10-15 05:41:44", + "end_timestamp": "2023-10-15 05:43:15", + "alerts": [], + "labels": [ + "INSERT_LARGE_DATA", + "IO_CONTENTION" + ], + "command": "python anomaly_trigger/main.py --anomaly INSERT_LARGE_DATA,IO_CONTENTION", + "script": "import os\nimport datetime\n\n#print the current time\ndef print_time():\n current_time = datetime.datetime.now()\n formatted_time = current_time.strftime(\"%Y-%m-%d %H:%M:%S\")\n print(formatted_time)\n\nif __name__ == \"__main__\":\n print_time()\n command = (\n \"su - root -c 'cd /sysbench-tpcc-master; \"\n \"./tpcc.lua --db-driver=pgsql --tables=2 --scale=3 --threads=50 --events=0 \"\n \"--pgsql-host=xxxx --pgsql-user=xxxx --pgsql-password=xxxx \"\n \"--pgsql-port=5432 --pgsql-db=tpcc --time=90 --rand-type=uniform --report-interval=10 run'\"\n )\n\n os.system(command)\n print_time()\n", + "description": "In a shared file system where multiple users are simultaneously uploading, downloading, or editing files, there is a high amount of I/O contention, causing delays in file transfers. This scenario is simulated using the given script \"python anomaly_trigger/main.py --anomaly INSERT_LARGE_DATA,IO_CONTENTION\".\n", + "desc": "In an Internet of Things (IoT) scenario, there is a database specifically used for collecting and analyzing sensor data. This database is designed to handle a large volume of data from various types of sensors. 
It contains a primary table called 'SensorDataDB' that stores the sensor readings and other relevant information. However, when a large number of sensors start transmitting data simultaneously at a high frequency, the database might encounter performance issues. This is mainly due to the lack of efficient data partitioning or insufficient buffering mechanisms, resulting in I/O contention. As a result, the database's ability to process these numerous concurrent write requests is limited, leading to slower file transfer speeds and impacting the overall performance of the system.\n" + }, + "158": { + "start_time": "1697319855", + "end_time": "1697319915", + "start_timestamp": "2023-10-15 05:44:15", + "end_timestamp": "2023-10-15 05:45:15", + "alerts": [], + "labels": [ + "POOR JOIN PERFORMANCE", + "CPU CONTENTION" + ], + "command": "python anomaly_trigger/main.py --anomaly POOR_JOIN_PERFORMANCE,CPU_CONTENTION", + "script": "import os\nimport re\nimport time\n\nimport psycopg2\n\n\nREPEATCOUNT = 1\nTIMELOGPATH = str(int(time.time())) + \"_job_trigger_time_log.txt\"\nTIMELOG = open(TIMELOGPATH, 'w+')\n\n\nclass Database():\n\n def __init__(self):\n self.conn = None\n self.conn = psycopg2.connect(database='imdbload',\n user='xxxx',\n password='xxxx',\n host='xxxx',\n port=5432)\n\n def execute_sql(self, sql):\n fail = 1\n cur = self.conn.cursor()\n i = 0\n cnt = 3\n while fail == 1 and i < cnt:\n try:\n fail = 0\n cur.execute(sql)\n except BaseException as error:\n fail = 1\n print(error)\n res = []\n if fail == 0:\n res = cur.fetchall()\n i = i + 1\n if fail == 1:\n # print(\"SQL Execution Fatal!!\", sql)\n return 0, ''\n elif fail == 0:\n return 1, res\n\n\ndef all_sql_files():\n res_path = \"{}/join-order-benchmark-master/\".format(\n os.path.dirname(os.path.abspath(__file__)))\n # all_file_list = list(filter(file_filter, os.listdir(res_path)))\n # all_file_list = sorted(all_file_list, key=custom_sort)\n all_file_list = [\n '1a.sql', '1b.sql', '1c.sql', '1d.sql',\n '2a.sql', '2b.sql', '2c.sql', '2d.sql',\n '3a.sql', '3b.sql', '3c.sql',\n '4a.sql', '4b.sql', '4c.sql',\n '5a.sql', '5b.sql', '5c.sql',\n '6a.sql', '6b.sql', '6c.sql', '6d.sql', '6e.sql', '6f.sql',\n '7a.sql', '7b.sql', '7c.sql',\n '8a.sql', '8b.sql', '8c.sql', '8d.sql',\n '9a.sql', '9b.sql', '9c.sql', '9d.sql',\n '10a.sql', '10b.sql', '10c.sql',\n '11a.sql', '11b.sql', '11c.sql', '11d.sql',\n '12a.sql', '12b.sql', '12c.sql',\n '13a.sql', '13b.sql', '13c.sql', '13d.sql',\n '14a.sql', '14b.sql', '14c.sql',\n '15a.sql', '15b.sql', '15c.sql', '15d.sql',\n '16a.sql', '16b.sql', '16c.sql', '16d.sql',\n '17a.sql', '17b.sql', '17c.sql', '17d.sql', '17e.sql', '17f.sql',\n '18a.sql', '18b.sql', '18c.sql',\n '19a.sql', '19b.sql', '19c.sql', '19d.sql',\n '20a.sql', '20b.sql', '20c.sql',\n '21a.sql', '21b.sql', '21c.sql',\n '22a.sql', '22b.sql', '22c.sql', '22d.sql',\n '23a.sql', '23b.sql', '23c.sql',\n '24a.sql', '24b.sql',\n '25a.sql', '25b.sql', '25c.sql',\n '26a.sql', '26b.sql', '26c.sql',\n '27a.sql', '27b.sql', '27c.sql',\n '28a.sql', '28b.sql', '28c.sql',\n '29a.sql', '29b.sql', '29c.sql',\n '30a.sql', '30b.sql', '30c.sql',\n '31a.sql', '31b.sql', '31c.sql',\n '32a.sql', '32b.sql',\n '33a.sql', '33b.sql', '33c.sql']\n\n print(all_file_list)\n files_list = []\n for file in all_file_list:\n files_list.append(res_path + file)\n return files_list\n\n\ndef custom_sort(item):\n # \u63d0\u53d6\u6570\u5b57\u548c\u5b57\u6bcd\u90e8\u5206\n match = re.match(r'(\\d+)(\\D+)', item)\n # 
\u5c06\u6570\u5b57\u90e8\u5206\u8f6c\u6362\u4e3a\u6574\u6570\u4ee5\u8fdb\u884c\u6bd4\u8f83\n num_part = int(match.group(1))\n # \u8fd4\u56de\u5143\u7ec4\u4ee5\u6309\u6570\u5b57\u548c\u5b57\u6bcd\u6392\u5e8f\n return (num_part, match.group(2))\n\n\ndef file_filter(f):\n if f[-4:] == '.sql' and 'schema' not in f and 'fkindexes' not in f:\n return True\n else:\n return False\n\n\ndef get_sql_from_file(file_name):\n file = open(file_name)\n lines = file.readlines().copy()\n sql = ''\n for line in lines:\n sql += line\n sql = sql.replace('\n', ' ').replace(' ', ' ').replace(' ', ' ')\n file.close()\n return sql\n\n\ndef test_hint_from_file(sql_file):\n db = Database()\n sql = get_sql_from_file(sql_file)\n success, result_cont = db.execute_sql(sql)\n print(success, result_cont)\n\n\ndef test_all():\n sql_files = all_sql_files()\n\n for sql_file in list(sql_files)[:-10]:\n if sql_file:\n test_hint_from_file(sql_file)\n\n\ndef test_one():\n res_path = \"{}/join-order-benchmark-master/\".format(\n os.path.dirname(os.path.abspath(__file__)))\n test_hint_from_file(res_path + '1a.sql')\n\n\nif __name__ == '__main__':\n for i in range(0, REPEATCOUNT):\n TIMELOG.write(str(int(time.time()))+\";\")\n test_all()\n TIMELOG.write(str(int(time.time()))+\"\n\")\n TIMELOG.flush()\n\n TIMELOG.close()\n\n)\n", + "description": "In a database system for a financial institution, multiple users are attempting to perform join operations on large tables, which require significant computational power. However, due to high CPU contention, the performance of these join operations is poor.\n", + "desc": "In an Internet of Things (IoT) scenario, there is a database called 'SensorDataDB' that stores data collected from various sensors. This database is designed to handle a large volume of sensor data. One of the key tables in this database is 'SensorReadings', which contains information about each sensor reading, such as sensor ID, reading type, value, timestamp, and location. In this scenario, there might be instances where multiple users or systems are simultaneously querying the 'SensorReadings' table, particularly to perform complex join queries. These join queries involve combining data from the 'SensorReadings' table with other tables to gain additional insights or conduct advanced analysis. However, due to poor join performance, these queries might take a long time to execute, impacting the overall efficiency of the database. Additionally, if there is CPU contention, it means that there are too many compute-intensive queries running on the database server at the same time, or the server's CPU resources are insufficient to handle all the queries. 
This can further worsen the performance of the join queries, causing delays and reducing the effectiveness of the analysis process.\n" + }, + "159": { + "start_time": "1697319975", + "end_time": "1697320121", + "start_timestamp": "2023-10-15 05:46:15", + "end_timestamp": "2023-10-15 05:48:41", + "alerts": [], + "labels": [ + "FETCH_LARGE_DATA", + "CORRELATED SUBQUERY" + ], + "command": "python anomaly_trigger/main.py --anomaly FETCH_LARGE_DATA,CORRELATED_SUBQUERY", + "script": "import os\nimport re\nimport time\n\nimport psycopg2\n\n\nREPEATCOUNT = 1\nTIMELOGPATH = str(int(time.time())) + \"_tpch_trigger_time_log.txt\"\nTIMELOG = open(TIMELOGPATH, 'w+')\n\n\nclass Database():\n\n def __init__(self):\n self.conn = None\n self.conn = psycopg2.connect(database='tpch',\n user='xxxx',\n password='xxxx',\n host='xxxx',\n port=5432)\n\n def execute_sql(self, sql):\n fail = 1\n cur = self.conn.cursor()\n i = 0\n cnt = 3\n while fail == 1 and i < cnt:\n try:\n fail = 0\n cur.execute(sql)\n except BaseException as error:\n fail = 1\n print(error)\n res = []\n if fail == 0:\n res = cur.fetchall()\n i = i + 1\n if fail == 1:\n # print(\"SQL Execution Fatal!!\", sql)\n return 0, ''\n elif fail == 0:\n return 1, res\n\n\ndef all_sql_files():\n res_path = \"{}/tpch-queries/\".format(\n os.path.dirname(os.path.abspath(__file__)))\n # all_file_list = list(filter(file_filter, os.listdir(res_path)))\n # all_file_list = sorted(all_file_list, key=custom_sort)\n all_file_list = [\n '4.explain.sql']\n\n print(all_file_list)\n files_list = []\n for file in all_file_list:\n files_list.append(res_path + file)\n return files_list\n\n\ndef custom_sort(item):\n # \u63d0\u53d6\u6570\u5b57\u548c\u5b57\u6bcd\u90e8\u5206\n match = re.match(r'(\\d+)(\\D+)', item)\n # \u5c06\u6570\u5b57\u90e8\u5206\u8f6c\u6362\u4e3a\u6574\u6570\u4ee5\u8fdb\u884c\u6bd4\u8f83\n num_part = int(match.group(1))\n # \u8fd4\u56de\u5143\u7ec4\u4ee5\u6309\u6570\u5b57\u548c\u5b57\u6bcd\u6392\u5e8f\n return (num_part, match.group(2))\n\n\ndef file_filter(f):\n if f[-4:] == '.sql' and 'schema' not in f and 'fkindexes' not in f:\n return True\n else:\n return False\n\n\ndef get_sql_from_file(file_name):\n file = open(file_name)\n lines = file.readlines().copy()\n sql = ''\n for line in lines:\n sql += line\n sql = sql.replace('\n', ' ').replace(' ', ' ').replace(' ', ' ')\n file.close()\n return sql\n\n\ndef test_hint_from_file(sql_file):\n db = Database()\n sql = get_sql_from_file(sql_file)\n success, result_cont = db.execute_sql(sql)\n print(success, result_cont)\n\n\ndef test_all():\n sql_files = all_sql_files()\n\n for sql_file in list(sql_files):\n if sql_file:\n test_hint_from_file(sql_file)\n\n\ndef test_one():\n res_path = \"{}/tpch-queries/\".format(\n os.path.dirname(os.path.abspath(__file__)))\n test_hint_from_file(res_path + '1.explain.sql')\n\n\nif __name__ == '__main__':\n for i in range(0, REPEATCOUNT):\n TIMELOG.write(str(int(time.time()))+\";\")\n test_all()\n TIMELOG.write(str(int(time.time()))+\"\n\")\n TIMELOG.flush()\n\n TIMELOG.close()\n", + "description": "In an online e-commerce platform, when trying to retrieve a large amount of data from the database, specifically the inventory information for each product, the process may involve executing related subqueries. If these subqueries are not optimized, the performance of retrieving inventory data can decrease.\n", + "desc": "In an IoT scenario, consider a database called 'IoTDataDB', which is used to store and analyze sensor data from various IoT devices. 
This database contains a key table named 'SensorData', which records data from different sensors, such as temperature, humidity, pressure, light, and motion sensors. Each row in the table represents a data reading from a specific sensor, with multiple columns storing information such as sensor ID, reading type, reading value, timestamp, and sensor location. Suppose there is a situation where a large amount of sensor data needs to be fetched and processed. For example, the user wants to retrieve all data from all sensors within a specific time range or with specific readings. To accomplish this, the database needs to execute correlated subqueries, where the results of one subquery depend on the results of another subquery. However, when dealing with a large volume of sensor data, executing these correlated subqueries can lead to performance issues. This is mainly due to the fact that correlated subqueries require the database to perform multiple scans of the sensor data table, resulting in a high I/O workload. Retrieving a large amount of data from disk can cause I/O bottlenecks and result in slower query execution times. Additionally, if the database is not properly indexed or optimized for these types of queries, the performance degradation can be even more pronounced. In the context of an IoT application, fetching large amounts of data with correlated subqueries can impact the overall efficiency and real-time processing capabilities of the system. It might cause delays in data analysis, hinder timely decision-making, and affect the overall performance of the IoT platform.\n"
  },
  "160": {
    "start_time": "1697320182",
    "end_time": "1697320253",
    "start_timestamp": "2023-10-15 05:49:42",
    "end_timestamp": "2023-10-15 05:50:53",
    "alerts": [],
    "labels": [
      "highly concurrent commits or highly concurrent inserts"
    ],
    "command": "python anomaly_trigger/main.py --anomaly INSERT_LARGE_DATA",
    "script": "import psycopg2\nimport sys\nsys.path.append('/root/DB-GPT/')\nimport time\nimport datetime\nimport random\nimport yaml\nfrom multiprocessing.pool import *\n\nclass DBArgs(object):\n\n def __init__(self, dbtype, config, dbname=None):\n self.dbtype = dbtype\n if self.dbtype == 'mysql':\n self.host = config['host']\n self.port = config['port']\n self.user = config['user']\n self.password = config['password']\n self.dbname = dbname if dbname else config['dbname']\n self.driver = 'com.mysql.jdbc.Driver'\n self.jdbc = 'jdbc:mysql://'\n else:\n self.host = config['host']\n self.port = config['port']\n self.user = config['user']\n self.password = config['password']\n self.dbname = dbname if dbname else config['dbname']\n self.driver = 'org.postgresql.Driver'\n self.jdbc = 'jdbc:postgresql://'\n\nclass Database():\n def __init__(self, args, timeout=-1):\n self.args = args\n self.conn = self.resetConn(timeout)\n\n\n # self.schema = self.compute_table_schema()\n\n def resetConn(self, timeout=-1):\n conn = psycopg2.connect(database=self.args.dbname,\n user=self.args.user,\n password=self.args.password,\n host=self.args.host,\n port=self.args.port)\n return conn\n\n def execute_sqls(self,sql):\n self.conn =self.resetConn(timeout=-1)\n cur = self.conn.cursor()\n cur.execute(sql)\n self.conn.commit()\n cur.close()\n self.conn.close()\n\n def execute_sql_duration(self, duration, sql, max_id=0, commit_interval=500):\n self.conn = self.resetConn(timeout=-1)\n cursor = self.conn.cursor()\n start = time.time()\n cnt = 0\n if duration > 0:\n while (time.time() - start) < duration:\n if max_id > 0:\n id 
= random.randint(1, max_id - 1)\n cursor.execute(sql + str(id) + ';')\n else:\n cursor.execute(sql)\n cnt += 1\n if cnt % commit_interval == 0:\n self.conn.commit()\n else:\n print(\"error, the duration should be larger than 0\")\n self.conn.commit()\n cursor.close()\n self.conn.close()\n return cnt\n\n def concurrent_execute_sql(self, threads, duration, sql, max_id=0, commit_interval=500):\n pool = ThreadPool(threads)\n results = [pool.apply_async(self.execute_sql_duration, (duration, sql, max_id, commit_interval)) for _ in range(threads)]\n pool.close()\n pool.join()\n return results\n \ndef init():\n #add the config\n config_path = \"/root/DB-GPT/config/tool_config.yaml\"\n with open(config_path, 'r') as config_file:\n config = yaml.safe_load(config_file) \n db_args =DBArgs('pgsql', config)\n return db_args\n\n#create a table\ndef create_table(table_name,colsize, ncolumns):\n db=Database(init())\n column_definitions = ', '.join(f'name{i} varchar({colsize})' for i in range(ncolumns))\n creat_sql = f'CREATE TABLE {table_name} (id int, {column_definitions}, time timestamp);'\n db.execute_sqls(creat_sql)\n\n#delete the table\ndef delete_table(table_name):\n db=Database(init())\n delete_sql=f'DROP TABLE if exists {table_name}'\n db.execute_sqls(delete_sql)\n\n#print the current time\ndef print_time():\n current_time = datetime.datetime.now()\n formatted_time = current_time.strftime(\"%Y-%m-%d %H:%M:%S\")\n print(formatted_time)\n\n\ndef insert_large_data(threads,duration,ncolumns,nrows,colsize,table_name='table1'):\n\n print_time()\n #Delete undeleted tables\n delete_table(table_name)\n #create a new table\n create_table(table_name,colsize, ncolumns)\n db=Database(init())\n #insert the data\n #insert_definitions = ', '.join(f'repeat(round(random()*999)::text,{(colsize//3)})' for i in range(ncolumns))\n insert_definitions = ', '.join(f'(SELECT substr(md5(random()::text), 1, {colsize}))' for i in range(ncolumns))\n insert_data=f'insert into {table_name} select generate_series(1,{nrows}),{insert_definitions}, now();'\n db.concurrent_execute_sql(threads,duration,insert_data,commit_interval=1)\n\n #delete the table\n delete_table(table_name)\n \n #print the end time\n print_time()\nif __name__ == \"__main__\":\n # Number of threads to use for concurrent inserts\n num_threads = 98\n \n # Duration for which to run the inserts (in seconds)\n insert_duration = None\n \n # Number of columns in the table\n num_columns = 14\n \n # Number of rows to insert\n num_rows = 78\n \n # Size of each column (in characters)\n column_size = 44\n \n # Table name\n table_name = 'table1'\n \n # Call the insert_large_data function\n insert_large_data(num_threads, insert_duration, num_columns, num_rows, column_size, table_name)\n", + "description": "In a data analysis system, there is a need to insert a large amount of data into the database simultaneously. This process involves inserting data from 98 sources, with each source having 14 columns and each column having a size of 44 characters. The total number of rows being inserted is 78.\n", + "desc": "In the database of an IoT system, let's suppose there is a database named 'IoTDataDB', which is responsible for storing and processing data from various sensors. This database is designed to handle a large volume of sensor data and contains a key table named 'SensorData', which records information from 78 sensors. Each sensor data entry includes details such as sensor ID, sensor type, sensor value, timestamp, location, and status. 
During a specific experiment, 98 sensors start sending data simultaneously to the database at a high frequency. Due to the lack of effective data partitioning, insufficient buffering mechanisms, or improper indexing, the database's ability to handle these concurrent write requests is limited. This can lead to increased write latency, resulting in delayed or failed data insertion in the database. Moreover, this inefficiency can also impact other database operations, leading to anomalies in the system.\n" + }, + "161": { + "start_time": "1697320313", + "end_time": "1697320384", + "start_timestamp": "2023-10-15 05:51:53", + "end_timestamp": "2023-10-15 05:53:04", + "alerts": [], + "labels": [ + "highly concurrent commits or highly concurrent inserts" + ], + "command": "python anomaly_trigger/main.py --anomaly INSERT_LARGE_DATA", + "script": "import psycopg2\nimport sys\nsys.path.append('/root/DB-GPT/')\nimport time\nimport datetime\nimport random\nimport yaml\nfrom multiprocessing.pool import *\n\nclass DBArgs(object):\n\n def __init__(self, dbtype, config, dbname=None):\n self.dbtype = dbtype\n if self.dbtype == 'mysql':\n self.host = config['host']\n self.port = config['port']\n self.user = config['user']\n self.password = config['password']\n self.dbname = dbname if dbname else config['dbname']\n self.driver = 'com.mysql.jdbc.Driver'\n self.jdbc = 'jdbc:mysql://'\n else:\n self.host = config['host']\n self.port = config['port']\n self.user = config['user']\n self.password = config['password']\n self.dbname = dbname if dbname else config['dbname']\n self.driver = 'org.postgresql.Driver'\n self.jdbc = 'jdbc:postgresql://'\n\nclass Database():\n def __init__(self, args, timeout=-1):\n self.args = args\n self.conn = self.resetConn(timeout)\n\n\n # self.schema = self.compute_table_schema()\n\n def resetConn(self, timeout=-1):\n conn = psycopg2.connect(database=self.args.dbname,\n user=self.args.user,\n password=self.args.password,\n host=self.args.host,\n port=self.args.port)\n return conn\n\n def execute_sqls(self,sql):\n self.conn =self.resetConn(timeout=-1)\n cur = self.conn.cursor()\n cur.execute(sql)\n self.conn.commit()\n cur.close()\n self.conn.close()\n\n def execute_sql_duration(self, duration, sql, max_id=0, commit_interval=500):\n self.conn = self.resetConn(timeout=-1)\n cursor = self.conn.cursor()\n start = time.time()\n cnt = 0\n if duration > 0:\n while (time.time() - start) < duration:\n if max_id > 0:\n id = random.randint(1, max_id - 1)\n cursor.execute(sql + str(id) + ';')\n else:\n cursor.execute(sql)\n cnt += 1\n if cnt % commit_interval == 0:\n self.conn.commit()\n else:\n print(\"error, the duration should be larger than 0\")\n self.conn.commit()\n cursor.close()\n self.conn.close()\n return cnt\n\n def concurrent_execute_sql(self, threads, duration, sql, max_id=0, commit_interval=500):\n pool = ThreadPool(threads)\n results = [pool.apply_async(self.execute_sql_duration, (duration, sql, max_id, commit_interval)) for _ in range(threads)]\n pool.close()\n pool.join()\n return results\n \ndef init():\n #add the config\n config_path = \"/root/DB-GPT/config/tool_config.yaml\"\n with open(config_path, 'r') as config_file:\n config = yaml.safe_load(config_file) \n db_args =DBArgs('pgsql', config)\n return db_args\n\n#create a table\ndef create_table(table_name,colsize, ncolumns):\n db=Database(init())\n column_definitions = ', '.join(f'name{i} varchar({colsize})' for i in range(ncolumns))\n creat_sql = f'CREATE TABLE {table_name} (id int, {column_definitions}, time timestamp);'\n 
db.execute_sqls(creat_sql)\n\n#delete the table\ndef delete_table(table_name):\n db=Database(init())\n delete_sql=f'DROP TABLE if exists {table_name}'\n db.execute_sqls(delete_sql)\n\n#print the current time\ndef print_time():\n current_time = datetime.datetime.now()\n formatted_time = current_time.strftime(\"%Y-%m-%d %H:%M:%S\")\n print(formatted_time)\n\n\ndef insert_large_data(threads,duration,ncolumns,nrows,colsize,table_name='table1'):\n\n print_time()\n #Delete undeleted tables\n delete_table(table_name)\n #create a new table\n create_table(table_name,colsize, ncolumns)\n db=Database(init())\n #insert the data\n #insert_definitions = ', '.join(f'repeat(round(random()*999)::text,{(colsize//3)})' for i in range(ncolumns))\n insert_definitions = ', '.join(f'(SELECT substr(md5(random()::text), 1, {colsize}))' for i in range(ncolumns))\n insert_data=f'insert into {table_name} select generate_series(1,{nrows}),{insert_definitions}, now();'\n db.concurrent_execute_sql(threads,duration,insert_data,commit_interval=1)\n\n #delete the table\n delete_table(table_name)\n \n #print the end time\n print_time()\nif __name__ == \"__main__\":\n # Number of threads to use for concurrent inserts\n num_threads = 98\n \n # Duration for which to run the inserts (in seconds)\n insert_duration = None\n \n # Number of columns in the table\n num_columns = 33\n \n # Number of rows to insert\n num_rows = 62\n \n # Size of each column (in characters)\n column_size = 92\n \n # Table name\n table_name = 'table1'\n \n # Call the insert_large_data function\n insert_large_data(num_threads, insert_duration, num_columns, num_rows, column_size, table_name)\n", + "description": "In a system where sensor data is being collected and inserted into a database, 98 sensors are generating a large amount of data. The database table has 33 columns, each with a size of 92 characters, and there are 62 rows of data to be inserted. This simulates the scenario of inserting a large volume of data into the database, which can potentially cause exceptions or slow down the insertion process.\n", + "desc": "In an Internet of Things (IoT) scenario, imagine a database used for collecting and storing sensor data from various devices named 'SensorDataDB'. This database is designed to handle a large volume of data from different types of sensors. In this particular case, the database administrator needs to insert a large amount of data into the 'SensorReadings' table, which is the primary table in the database. This table contains information from 62 sensors, with each sensor generating data for 33 different attributes. These attributes may include sensor ID, reading type, reading value, timestamp, location, and status. Due to the high number of threads (98) and the large size of the data being inserted (each attribute can contain up to 92 characters), this operation has the potential to trigger anomalies within the database. These anomalies could be caused by factors such as insufficient database optimization, inadequate indexing, or the lack of buffering mechanisms. 
If these issues are not addressed, the database's performance could be negatively impacted, resulting in slower insertion speeds and potential inconsistencies in the stored data.\n" + }, + "162": { + "start_time": "1697320444", + "end_time": "1697320505", + "start_timestamp": "2023-10-15 05:54:04", + "end_timestamp": "2023-10-15 05:55:05", + "alerts": [], + "labels": [ + "highly concurrent updates" + ], + "command": "python anomaly_trigger/main.py --anomaly LOCK_CONTENTION", + "script": "import psycopg2\nimport sys\nsys.path.append('/root/DB-GPT/')\nimport time\nimport datetime\nimport random\nimport yaml\nfrom multiprocessing.pool import *\n\n\nclass DBArgs(object):\n\n def __init__(self, dbtype, config, dbname=None):\n self.dbtype = dbtype\n if self.dbtype == 'mysql':\n self.host = config['host']\n self.port = config['port']\n self.user = config['user']\n self.password = config['password']\n self.dbname = dbname if dbname else config['dbname']\n self.driver = 'com.mysql.jdbc.Driver'\n self.jdbc = 'jdbc:mysql://'\n else:\n self.host = config['host']\n self.port = config['port']\n self.user = config['user']\n self.password = config['password']\n self.dbname = dbname if dbname else config['dbname']\n self.driver = 'org.postgresql.Driver'\n self.jdbc = 'jdbc:postgresql://'\n\nclass Database():\n def __init__(self, args, timeout=-1):\n self.args = args\n self.conn = self.resetConn(timeout)\n\n\n # self.schema = self.compute_table_schema()\n\n def resetConn(self, timeout=-1):\n conn = psycopg2.connect(database=self.args.dbname,\n user=self.args.user,\n password=self.args.password,\n host=self.args.host,\n port=self.args.port)\n return conn\n \n def execute_sqls(self,sql):\n self.conn =self.resetConn(timeout=-1)\n cur = self.conn.cursor()\n cur.execute(sql)\n self.conn.commit()\n cur.close()\n self.conn.close()\n\n def execute_sql_duration(self, duration, sql, max_id=0, commit_interval=500):\n self.conn = self.resetConn(timeout=-1)\n cursor = self.conn.cursor()\n start = time.time()\n cnt = 0\n if duration > 0:\n while (time.time() - start) < duration:\n if max_id > 0:\n id = random.randint(1, max_id - 1)\n cursor.execute(sql + str(id) + ';')\n else:\n cursor.execute(sql)\n cnt += 1\n if cnt % commit_interval == 0:\n self.conn.commit()\n else:\n print(\"error, the duration should be larger than 0\")\n self.conn.commit()\n cursor.close()\n self.conn.close()\n return cnt\n\n def concurrent_execute_sql(self, threads, duration, sql, max_id=0, commit_interval=500):\n pool = ThreadPool(threads)\n results = [pool.apply_async(self.execute_sql_duration, (duration, sql, max_id, commit_interval)) for _ in range(threads)]\n pool.close()\n pool.join()\n return results\n\ndef init():\n #add the config\n config_path = \"/root/DB-GPT/config/tool_config.yaml\"\n with open(config_path, 'r') as config_file:\n config = yaml.safe_load(config_file) \n db_args =DBArgs('pgsql', config)\n return db_args\n\n\n#create a table\ndef create_table(table_name,colsize, ncolumns):\n db=Database(init())\n column_definitions = ', '.join(f'name{i} varchar({colsize})' for i in range(ncolumns))\n creat_sql = f'CREATE TABLE {table_name} (id int, {column_definitions}, time timestamp);'\n db.execute_sqls(creat_sql)\n\n#delete the table\ndef delete_table(table_name):\n db=Database(init())\n delete_sql=f'DROP TABLE if exists {table_name}'\n db.execute_sqls(delete_sql)\n\n#print the current time\ndef print_time():\n current_time = datetime.datetime.now()\n formatted_time = current_time.strftime(\"%Y-%m-%d %H:%M:%S\")\n 
print(formatted_time)\n\n\ndef lock_contention(threads,duration,ncolumns,nrows,colsize,table_name='table1'):\n #create a new table\n print_time()\n delete_table(table_name)\n create_table(table_name,colsize, ncolumns)\n db=Database(init())\n # insert some data to be updated \n insert_definitions = ', '.join(f'(SELECT substr(md5(random()::text), 1, {colsize}))' for i in range(ncolumns))\n insert_data=f'insert into {table_name} select generate_series(1,{nrows}),{insert_definitions}, now();' \n db.execute_sqls(insert_data) \n pool = Pool(threads)\n for _ in range(threads):\n pool.apply_async(\n lock, (table_name, ncolumns, colsize, duration, nrows))\n pool.close()\n pool.join()\n #delete the table\n delete_table(table_name)\n print_time()\n\ndef lock(table_name, ncolumns, colsize, duration, nrows):\n args=init()\n start = time.time()\n #lock_contention\n while time.time()-start < duration:\n conn = psycopg2.connect(database=args.dbname, user=args.user, password=args.password,\n host=args.host, port=args.port)\n cur = conn.cursor()\n while time.time()-start < duration:\n col_name = random.randint(0, ncolumns-1)\n row_name = random.randint(1, nrows-1)\n lock_contention = f'update {table_name} set name{col_name}=(SELECT substr(md5(random()::text), 1, {colsize})) where id ={row_name}'\n #db.concurrent_execute_sql(threads,duration,lock_contention,nrows)\n cur.execute(lock_contention)\n conn.commit()\n conn.commit()\n conn.close()\n\nif __name__ == \"__main__\":\n # Number of threads to use for concurrent inserts\n num_threads = 60\n \n # Duration for which to run the inserts (in seconds)\n insert_duration = None\n \n # Number of columns in the table\n num_columns = 55\n \n # Number of rows to insert\n num_rows = 340\n \n # Size of each column (in characters)\n column_size = 88\n \n # Table name\n table_name = 'table1'\n \n # Call the insert_large_data function\n lock_contention(num_threads, insert_duration, num_columns, num_rows, column_size, table_name)\n", + "description": "In the database of an online store, 60 users simultaneously attempt to perform frequent update operations in a database table containing 55 columns and 340 rows of product records each with a column size of 88 characters. These users compete with each other to lock the database table to perform the update operations.\n", + "desc": "In an Internet of Things (IoT) scenario, let's consider an IoT device management database, named 'DeviceManagementDB'. This database is responsible for storing information about various IoT devices, such as sensors, actuators, or connected devices, and managing their configurations, updates, and statuses. Within this database, there is a crucial table called 'DeviceStatus', which records the current status and operational information of each device. For example, this table might contain device ID, device type, last communication timestamp, firmware version, battery level, signal strength, device location, and more. Suppose that, due to certain circumstances, or a large influx of new IoT devices being added to the network, many devices start sending their status updates at the same time. As a result, multiple threads from the device management system simultaneously attempt to update the 'DeviceStatus' table. However, due to table-level or row-level locks in the database, concurrent update operations might experience contention issues, resulting in delays, blocking, or even failed database operations. 
This lock contention can ultimately lead to performance degradation in the IoT device management system, affecting the real-time monitoring and management of IoT devices.\n" + }, + "163": { + "start_time": "1697320565", + "end_time": "1697320625", + "start_timestamp": "2023-10-15 05:56:05", + "end_timestamp": "2023-10-15 05:57:05", + "alerts": [], + "labels": [ + "highly deletes" + ], + "command": "python anomaly_trigger/main.py --anomaly VACUUM", + "script": "import psycopg2\nimport sys\nsys.path.append('/root/DB-GPT/')\nimport time\nimport datetime\nimport random\nimport yaml\nfrom multiprocessing.pool import *\n\n\nclass DBArgs(object):\n\n def __init__(self, dbtype, config, dbname=None):\n self.dbtype = dbtype\n if self.dbtype == 'mysql':\n self.host = config['host']\n self.port = config['port']\n self.user = config['user']\n self.password = config['password']\n self.dbname = dbname if dbname else config['dbname']\n self.driver = 'com.mysql.jdbc.Driver'\n self.jdbc = 'jdbc:mysql://'\n else:\n self.host = config['host']\n self.port = config['port']\n self.user = config['user']\n self.password = config['password']\n self.dbname = dbname if dbname else config['dbname']\n self.driver = 'org.postgresql.Driver'\n self.jdbc = 'jdbc:postgresql://'\n\nclass Database():\n def __init__(self, args, timeout=-1):\n self.args = args\n self.conn = self.resetConn(timeout)\n\n\n # self.schema = self.compute_table_schema()\n\n def resetConn(self, timeout=-1):\n conn = psycopg2.connect(database=self.args.dbname,\n user=self.args.user,\n password=self.args.password,\n host=self.args.host,\n port=self.args.port)\n return conn\n \n def execute_sqls(self,sql):\n self.conn =self.resetConn(timeout=-1)\n cur = self.conn.cursor()\n cur.execute(sql)\n self.conn.commit()\n cur.close()\n self.conn.close()\n\n def execute_sql_duration(self, duration, sql, max_id=0, commit_interval=500):\n self.conn = self.resetConn(timeout=-1)\n cursor = self.conn.cursor()\n start = time.time()\n cnt = 0\n if duration > 0:\n while (time.time() - start) < duration:\n if max_id > 0:\n id = random.randint(1, max_id - 1)\n cursor.execute(sql + str(id) + ';')\n else:\n cursor.execute(sql)\n cnt += 1\n if cnt % commit_interval == 0:\n self.conn.commit()\n else:\n print(\"error, the duration should be larger than 0\")\n self.conn.commit()\n cursor.close()\n self.conn.close()\n return cnt\n\n def concurrent_execute_sql(self, threads, duration, sql, max_id=0, commit_interval=500):\n pool = ThreadPool(threads)\n results = [pool.apply_async(self.execute_sql_duration, (duration, sql, max_id, commit_interval)) for _ in range(threads)]\n pool.close()\n pool.join()\n return results\n \ndef init():\n #add the config\n config_path = \"/root/DB-GPT/config/tool_config.yaml\"\n with open(config_path, 'r') as config_file:\n config = yaml.safe_load(config_file) \n db_args =DBArgs('pgsql', config)\n return db_args\n\n\n#create a table\ndef create_table(table_name,colsize, ncolumns):\n db=Database(init())\n column_definitions = ', '.join(f'name{i} varchar({colsize})' for i in range(ncolumns))\n creat_sql = f'CREATE TABLE {table_name} (id int, {column_definitions}, time timestamp);'\n db.execute_sqls(creat_sql)\n\n#delete the table\ndef delete_table(table_name):\n db=Database(init())\n delete_sql=f'DROP TABLE if exists {table_name}'\n db.execute_sqls(delete_sql)\n\n#print the current time\ndef print_time():\n current_time = datetime.datetime.now()\n formatted_time = current_time.strftime(\"%Y-%m-%d %H:%M:%S\")\n print(formatted_time)\n\n\ndef 
vacuum(threads,duration,ncolumns,nrows,colsize,table_name='table1'):\n db=Database(init())\n #create a new table\n print_time()\n delete_table(table_name)\n create_table(table_name,colsize, ncolumns)\n\n # insert some data to be deleted\n insert_definitions = ', '.join(f'(SELECT substr(md5(random()::text), 1, {colsize}))' for i in range(ncolumns))\n insert_data=f'insert into {table_name} select generate_series(1,{nrows}),{insert_definitions}, now();' \n db.execute_sqls(insert_data) \n\n # delete 80% of the rows\n delete_nrows=int(nrows*0.8)\n vacuum=f'delete from {table_name} where id < {delete_nrows};'\n db.execute_sqls(vacuum)\n\n # do the select , then the vacuum occurs\n select='select * from '+table_name+' where id='\n db.concurrent_execute_sql(threads,duration,select,nrows)\n\n #delete the table\n delete_table(table_name)\n print_time()\n\n\nif __name__ == \"__main__\":\n # Number of threads to use for concurrent inserts\n num_threads = 187\n \n # Duration for which to run the inserts (in seconds)\n insert_duration = None\n \n # Number of columns in the table\n num_columns = 14\n \n # Number of rows to insert\n num_rows = 2541105\n \n # Size of each column (in characters)\n column_size = 92\n \n # Table name\n table_name = 'table1'\n \n # Call the insert_large_data function\n vacuum(num_threads, insert_duration, num_columns, num_rows, column_size, table_name)\n", + "description": "In the database of an online store, 187 users concurrently search for products using terms such as product name, category, price range, etc. on a database table with 14 columns and 2,541,105 rows. Each column has a size of 92 characters. This scenario simulates an exception caused by heavy search activity after a large-scale data cleaning operation (vacuum) on the database table.\n", + "desc": "In an e-commerce environment, there is a database called 'StoreDatabase', which stores various information related to products. One important table in this database is called 'ProductDetails', which contains data about different products available for sale. This table consists of 2,541,105 rows, with each row representing a unique product entry. The table has a total of 14 columns, each with a size of 92 characters. These columns may include product SKU, name, price, stock quantity, description, brand, category, size, color, weight, production date, expiration date, supplier information, and country. Occasionally, due to business requirements such as product updates or data cleanup, the administrator needs to perform a VACUUM operation on the database. This operation involves reclaiming unused disk space and optimizing the table structure to improve overall performance. However, doing a VACUUM operation on a large table like 'ProductDetails' with a high number of threads (187 in this case) could potentially affect the operation of the database. This is because the VACUUM operation generates intense disk I/O and locks the table for a period of time, leading to potential delays or failures in other database operations. 
Consequently, the overall performance and availability of the e-commerce platform might be impacted.\n" + }, + "164": { + "start_time": "1697320685", + "end_time": "1697320799", + "start_timestamp": "2023-10-15 05:58:05", + "end_timestamp": "2023-10-15 05:59:59", + "alerts": [], + "labels": [ + "too many indexes" + ], + "command": "python anomaly_trigger/main.py --anomaly REDUNDANT_INDEX", + "script": "import psycopg2\nimport sys\nsys.path.append('/root/DB-GPT/')\nimport time\nimport datetime\nimport random\nimport yaml\nfrom multiprocessing.pool import *\n\n\nclass DBArgs(object):\n\n def __init__(self, dbtype, config, dbname=None):\n self.dbtype = dbtype\n if self.dbtype == 'mysql':\n self.host = config['host']\n self.port = config['port']\n self.user = config['user']\n self.password = config['password']\n self.dbname = dbname if dbname else config['dbname']\n self.driver = 'com.mysql.jdbc.Driver'\n self.jdbc = 'jdbc:mysql://'\n else:\n self.host = config['host']\n self.port = config['port']\n self.user = config['user']\n self.password = config['password']\n self.dbname = dbname if dbname else config['dbname']\n self.driver = 'org.postgresql.Driver'\n self.jdbc = 'jdbc:postgresql://'\n\nclass Database():\n def __init__(self, args, timeout=-1):\n self.args = args\n self.conn = self.resetConn(timeout)\n\n\n # self.schema = self.compute_table_schema()\n\n def resetConn(self, timeout=-1):\n conn = psycopg2.connect(database=self.args.dbname,\n user=self.args.user,\n password=self.args.password,\n host=self.args.host,\n port=self.args.port)\n return conn\n \n def execute_sqls(self,sql):\n self.conn =self.resetConn(timeout=-1)\n cur = self.conn.cursor()\n cur.execute(sql)\n self.conn.commit()\n cur.close()\n self.conn.close()\n\n def execute_sql_duration(self, duration, sql, max_id=0, commit_interval=500):\n self.conn = self.resetConn(timeout=-1)\n cursor = self.conn.cursor()\n start = time.time()\n cnt = 0\n if duration > 0:\n while (time.time() - start) < duration:\n if max_id > 0:\n id = random.randint(1, max_id - 1)\n cursor.execute(sql + str(id) + ';')\n else:\n cursor.execute(sql)\n cnt += 1\n if cnt % commit_interval == 0:\n self.conn.commit()\n else:\n print(\"error, the duration should be larger than 0\")\n self.conn.commit()\n cursor.close()\n self.conn.close()\n return cnt\n\n def concurrent_execute_sql(self, threads, duration, sql, max_id=0, commit_interval=500):\n pool = ThreadPool(threads)\n results = [pool.apply_async(self.execute_sql_duration, (duration, sql, max_id, commit_interval)) for _ in range(threads)]\n pool.close()\n pool.join()\n return results\n \n def build_index(self, table_name, idx_num):\n self.conn = self.resetConn(timeout=-1)\n cursor = self.conn.cursor()\n \n for i in range(0, idx_num):\n the_sql = 'CREATE INDEX index_' + table_name + '_' + str(i) + ' ON ' + table_name + '(name' + str(i) + ');'\n print(the_sql)\n cursor.execute(the_sql)\n\n \n self.conn.commit()\n self.conn.close()\n return\n\n\n \n def drop_index(self,table_name):\n self.conn = self.resetConn(timeout=-1)\n cursor = self.conn.cursor()\n cursor.execute(\"select indexname from pg_indexes where tablename='\"+table_name+\"';\")\n idxs = cursor.fetchall()\n for idx in idxs:\n the_sql = 'DROP INDEX ' + idx[0] + ';'\n cursor.execute(the_sql)\n print(the_sql)\n self.conn.commit()\n self.conn.close()\n return\n\n\ndef init():\n #add the config\n config_path = \"/root/DB-GPT/config/tool_config.yaml\"\n with open(config_path, 'r') as config_file:\n config = yaml.safe_load(config_file) \n db_args 
=DBArgs('pgsql', config)\n return db_args\n\n\n#create a table\ndef create_table(table_name,colsize, ncolumns):\n db=Database(init())\n column_definitions = ', '.join(f'name{i} varchar({colsize})' for i in range(ncolumns))\n creat_sql = f'CREATE TABLE {table_name} (id int, {column_definitions}, time timestamp);'\n db.execute_sqls(creat_sql)\n\n#delete the table\ndef delete_table(table_name):\n db=Database(init())\n delete_sql=f'DROP TABLE if exists {table_name}'\n db.execute_sqls(delete_sql)\n\n#print the current time\ndef print_time():\n current_time = datetime.datetime.now()\n formatted_time = current_time.strftime(\"%Y-%m-%d %H:%M:%S\")\n print(formatted_time)\n\ndef redundent_index(threads,duration,ncolumns,nrows,colsize,nindex,table_name='table1'):\n #create a new table\n print_time()\n delete_table(table_name)\n create_table(table_name,colsize, ncolumns)\n db=Database(init())\n # insert some data to be updated \n insert_definitions = ', '.join(f'(SELECT substr(md5(random()::text), 1, {colsize}))' for i in range(ncolumns))\n insert_data=f'insert into {table_name} select generate_series(1,{nrows}),{insert_definitions}, now();' \n db.execute_sqls(insert_data) \n\n #initialization of the indexes\n nindex=int((nindex*ncolumns)/10)\n db.build_index(table_name,nindex)\n id_index='CREATE INDEX index_'+table_name+'_id ON '+table_name+'(id);'\n db.execute_sqls(id_index)\n\n #lock_contention\n pool = Pool(threads)\n for _ in range(threads):\n pool.apply_async(\n lock, (table_name, ncolumns, colsize, duration, nrows))\n pool.close()\n pool.join()\n\n #drop the index\n db.drop_index(table_name)\n\n #delete the table\n delete_table(table_name)\n print_time()\n\n\ndef lock(table_name, ncolumns, colsize, duration, nrows):\n args=init()\n start = time.time()\n #lock_contention\n while time.time()-start < duration:\n conn = psycopg2.connect(database=args.dbname, user=args.user, password=args.password,\n host=args.host, port=args.port)\n cur = conn.cursor()\n while time.time()-start < duration:\n col_name = random.randint(0, ncolumns-1)\n row_name = random.randint(1, nrows-1)\n lock_contention = f'update {table_name} set name{col_name}=(SELECT substr(md5(random()::text), 1, {colsize})) where id ={row_name}'\n #db.concurrent_execute_sql(threads,duration,lock_contention,nrows)\n cur.execute(lock_contention)\n conn.commit()\n conn.commit()\n conn.close()\n\nif __name__ == \"__main__\":\n # Number of threads to use for concurrent inserts\n num_threads = 8\n \n # Duration for which to run the inserts (in seconds)\n insert_duration = None\n \n # Number of columns in the table\n num_columns = 75\n \n # Number of rows to insert\n num_rows = 638193\n \n # Size of each column (in characters)\n column_size = 53\n \n # Table name\n table_name = 'table1'\n \n nindex=6\n \n # Call the insert_large_data function\n redundent_index(num_threads, insert_duration, num_columns, num_rows, column_size, nindex,table_name)\n", + "description": "In a large e-commerce database with 75 columns and 638,193 rows, each column having a size of 53 characters, there is a scenario where excessive indexes are created for product information such as name, category, and price range. This is followed by a simulation of queries from multiple users, causing unnecessary storage overhead and decreased performance.\n", + "desc": "In a business intelligence scenario, where data analysis is performed on large corporate financial datasets, there is a database called 'CorporateFinanceDB'. 
This database contains multiple tables, including a key table called 'FinancialRecords', which stores detailed financial transaction information. The 'FinancialRecords' table contains 638,193 rows of data, with each row representing a financial record and a total of 75 columns, each containing information of up to 53 characters. These columns may include transaction ID, transaction type, amount, date, department, project code, financial year, audit status, and more. During financial analysis, the database administrator might create redundant indexes to accelerate complex queries, such as departmental budget analysis or quarterly income reports. However, the creation of numerous indexes can result in additional storage usage and performance overhead in the database. Furthermore, frequent index operations might cause database fragmentation, which can further impact performance. In a business intelligence environment, this can lead to delayed report generation, affecting the efficiency of the decision-making process.\n" + }, + "165": { + "start_time": "1697320859", + "end_time": "1697320950", + "start_timestamp": "2023-10-15 06:00:59", + "end_timestamp": "2023-10-15 06:02:30", + "alerts": [], + "labels": [ + "INSERT_LARGE_DATA", + "IO_CONTENTION" + ], + "command": "python anomaly_trigger/main.py --anomaly INSERT_LARGE_DATA,IO_CONTENTION", + "script": "import os\nimport datetime\n\n#print the current time\ndef print_time():\n current_time = datetime.datetime.now()\n formatted_time = current_time.strftime(\"%Y-%m-%d %H:%M:%S\")\n print(formatted_time)\n\nif __name__ == \"__main__\":\n print_time()\n command = (\n \"su - root -c 'cd /sysbench-tpcc-master; \"\n \"./tpcc.lua --db-driver=pgsql --tables=2 --scale=3 --threads=50 --events=0 \"\n \"--pgsql-host=xxxx --pgsql-user=xxxx --pgsql-password=xxxx \"\n \"--pgsql-port=5432 --pgsql-db=tpcc --time=90 --rand-type=uniform --report-interval=10 run'\"\n )\n\n os.system(command)\n print_time()\n", + "description": "In a shared file system, when multiple users are simultaneously uploading, downloading, or editing files, there is a competition for input/output operations (IO contention). This leads to a slowdown in file transfer and impacts the overall performance of the system.\n", + "desc": "In an IoT scenario involving a smart home system, there is a database named 'SmartHomeDB' that stores data collected from various smart devices within a home. This database consists of multiple tables, one of which is called 'DeviceData', which records detailed information about the different devices in the home. This table contains data from 100 devices, with each device having its own row and each row containing information such as device ID, device type (such as thermostat, smart lock, motion sensor), current status, battery level, connection status, and more. On a typical day, these smart devices constantly communicate with the database to update their statuses and send sensor data. However, when a large number of devices simultaneously send data at a high frequency, it puts a strain on the database's input/output (I/O) subsystem. The I/O contention occurs because the server's storage and network bandwidth might become overloaded, leading to slower data processing and potentially causing delays or failures in data transmission. Additionally, frequent write operations to the database, such as updating device statuses or recording sensor readings, can further exacerbate the I/O contention issue. 
Overall, this I/O contention in the database can negatively impact the responsiveness and performance of the entire smart home system.\n"
  },
  "166": {
    "start_time": "1697321010",
    "end_time": "1697321071",
    "start_timestamp": "2023-10-15 06:03:30",
    "end_timestamp": "2023-10-15 06:04:31",
    "alerts": [],
    "labels": [
      "POOR JOIN PERFORMANCE",
      "CPU CONTENTION"
    ],
    "command": "python anomaly_trigger/main.py --anomaly POOR_JOIN_PERFORMANCE,CPU_CONTENTION",
    "script": "import os\nimport re\nimport time\n\nimport psycopg2\n\n\nREPEATCOUNT = 1\nTIMELOGPATH = str(int(time.time())) + \"_job_trigger_time_log.txt\"\nTIMELOG = open(TIMELOGPATH, 'w+')\n\n\nclass Database():\n\n def __init__(self):\n self.conn = None\n self.conn = psycopg2.connect(database='imdbload',\n user='xxxx',\n password='xxxx',\n host='xxxx',\n port=5432)\n\n def execute_sql(self, sql):\n fail = 1\n cur = self.conn.cursor()\n i = 0\n cnt = 3\n while fail == 1 and i < cnt:\n try:\n fail = 0\n cur.execute(sql)\n except BaseException as error:\n fail = 1\n print(error)\n res = []\n if fail == 0:\n res = cur.fetchall()\n i = i + 1\n if fail == 1:\n # print(\"SQL Execution Fatal!!\", sql)\n return 0, ''\n elif fail == 0:\n return 1, res\n\n\ndef all_sql_files():\n res_path = \"{}/join-order-benchmark-master/\".format(\n os.path.dirname(os.path.abspath(__file__)))\n # all_file_list = list(filter(file_filter, os.listdir(res_path)))\n # all_file_list = sorted(all_file_list, key=custom_sort)\n all_file_list = [\n '1a.sql', '1b.sql', '1c.sql', '1d.sql',\n '2a.sql', '2b.sql', '2c.sql', '2d.sql',\n '3a.sql', '3b.sql', '3c.sql',\n '4a.sql', '4b.sql', '4c.sql',\n '5a.sql', '5b.sql', '5c.sql',\n '6a.sql', '6b.sql', '6c.sql', '6d.sql', '6e.sql', '6f.sql',\n '7a.sql', '7b.sql', '7c.sql',\n '8a.sql', '8b.sql', '8c.sql', '8d.sql',\n '9a.sql', '9b.sql', '9c.sql', '9d.sql',\n '10a.sql', '10b.sql', '10c.sql',\n '11a.sql', '11b.sql', '11c.sql', '11d.sql',\n '12a.sql', '12b.sql', '12c.sql',\n '13a.sql', '13b.sql', '13c.sql', '13d.sql',\n '14a.sql', '14b.sql', '14c.sql',\n '15a.sql', '15b.sql', '15c.sql', '15d.sql',\n '16a.sql', '16b.sql', '16c.sql', '16d.sql',\n '17a.sql', '17b.sql', '17c.sql', '17d.sql', '17e.sql', '17f.sql',\n '18a.sql', '18b.sql', '18c.sql',\n '19a.sql', '19b.sql', '19c.sql', '19d.sql',\n '20a.sql', '20b.sql', '20c.sql',\n '21a.sql', '21b.sql', '21c.sql',\n '22a.sql', '22b.sql', '22c.sql', '22d.sql',\n '23a.sql', '23b.sql', '23c.sql',\n '24a.sql', '24b.sql',\n '25a.sql', '25b.sql', '25c.sql',\n '26a.sql', '26b.sql', '26c.sql',\n '27a.sql', '27b.sql', '27c.sql',\n '28a.sql', '28b.sql', '28c.sql',\n '29a.sql', '29b.sql', '29c.sql',\n '30a.sql', '30b.sql', '30c.sql',\n '31a.sql', '31b.sql', '31c.sql',\n '32a.sql', '32b.sql',\n '33a.sql', '33b.sql', '33c.sql']\n\n print(all_file_list)\n files_list = []\n for file in all_file_list:\n files_list.append(res_path + file)\n return files_list\n\n\ndef custom_sort(item):\n # Extract the numeric and alphabetic parts\n match = re.match(r'(\\d+)(\\D+)', item)\n # Convert the numeric part to an integer for comparison\n num_part = int(match.group(1))\n # Return a tuple so files sort by number and then by letter\n return (num_part, match.group(2))\n\n\ndef file_filter(f):\n if f[-4:] == '.sql' and 'schema' not in f and 'fkindexes' not in f:\n return True\n else:\n return False\n\n\ndef get_sql_from_file(file_name):\n file = open(file_name)\n lines = file.readlines().copy()\n sql = ''\n for line in lines:\n sql += 
line\n sql = sql.replace('\n', ' ').replace(' ', ' ').replace(' ', ' ')\n file.close()\n return sql\n\n\ndef test_hint_from_file(sql_file):\n db = Database()\n sql = get_sql_from_file(sql_file)\n success, result_cont = db.execute_sql(sql)\n print(success, result_cont)\n\n\ndef test_all():\n sql_files = all_sql_files()\n\n for sql_file in list(sql_files)[:-10]:\n if sql_file:\n test_hint_from_file(sql_file)\n\n\ndef test_one():\n res_path = \"{}/join-order-benchmark-master/\".format(\n os.path.dirname(os.path.abspath(__file__)))\n test_hint_from_file(res_path + '1a.sql')\n\n\nif __name__ == '__main__':\n for i in range(0, REPEATCOUNT):\n TIMELOG.write(str(int(time.time()))+\";\")\n test_all()\n TIMELOG.write(str(int(time.time()))+\"\n\")\n TIMELOG.flush()\n\n TIMELOG.close()\n\n)\n", + "description": "In a data analysis system, there are simultaneous join operations being performed between multiple tables, but the join performance is poor due to inefficient query optimization. Additionally, there is high CPU contention as multiple users are competing for CPU resources, leading to slower query execution and performance degradation.\n", + "desc": "In a business intelligence scenario, a database named 'CorporateAnalyticsDB' is used by a large corporation to store and analyze various business data. This database contains multiple complex tables such as 'SalesData', 'CustomerProfiles', 'ProductCatalog', and 'MarketTrends'. The analysts in the company frequently need to perform complex join queries across these tables to generate comprehensive business reports. For example, they might need to join the 'SalesData' table, which contains sales records, with the 'CustomerProfiles' table to analyze the purchasing behaviors of different customer groups. Additionally, they might also need to link this data with the 'ProductCatalog' and 'MarketTrends' tables to gain deeper market insights. However, due to the large size of these tables and the complexity of the join operations, executing these queries can be slow. This could be due to ineffective indexing of the join keys or insufficient query optimization. During peak periods when multiple complex join queries are executed simultaneously, there might be competition for CPU resources, further reducing query efficiency. 
This CPU contention can occur if there are too many compute-intensive queries running on the database server or if the server's CPU resources are insufficient to handle the workload.\n"
  },
  "167": {
    "start_time": "1697321131",
    "end_time": "1697321277",
    "start_timestamp": "2023-10-15 06:05:31",
    "end_timestamp": "2023-10-15 06:07:57",
    "alerts": [],
    "labels": [
      "FETCH_LARGE_DATA",
      "CORRELATED SUBQUERY"
    ],
    "command": "python anomaly_trigger/main.py --anomaly FETCH_LARGE_DATA,CORRELATED_SUBQUERY",
    "script": "import os\nimport re\nimport time\n\nimport psycopg2\n\n\nREPEATCOUNT = 1\nTIMELOGPATH = str(int(time.time())) + \"_tpch_trigger_time_log.txt\"\nTIMELOG = open(TIMELOGPATH, 'w+')\n\n\nclass Database():\n\n def __init__(self):\n self.conn = None\n self.conn = psycopg2.connect(database='tpch',\n user='xxxx',\n password='xxxx',\n host='xxxx',\n port=5432)\n\n def execute_sql(self, sql):\n fail = 1\n cur = self.conn.cursor()\n i = 0\n cnt = 3\n while fail == 1 and i < cnt:\n try:\n fail = 0\n cur.execute(sql)\n except BaseException as error:\n fail = 1\n print(error)\n res = []\n if fail == 0:\n res = cur.fetchall()\n i = i + 1\n if fail == 1:\n # print(\"SQL Execution Fatal!!\", sql)\n return 0, ''\n elif fail == 0:\n return 1, res\n\n\ndef all_sql_files():\n res_path = \"{}/tpch-queries/\".format(\n os.path.dirname(os.path.abspath(__file__)))\n # all_file_list = list(filter(file_filter, os.listdir(res_path)))\n # all_file_list = sorted(all_file_list, key=custom_sort)\n all_file_list = [\n '4.explain.sql']\n\n print(all_file_list)\n files_list = []\n for file in all_file_list:\n files_list.append(res_path + file)\n return files_list\n\n\ndef custom_sort(item):\n # Extract the numeric and alphabetic parts\n match = re.match(r'(\\d+)(\\D+)', item)\n # Convert the numeric part to an integer for comparison\n num_part = int(match.group(1))\n # Return a tuple so files sort by number and then by letter\n return (num_part, match.group(2))\n\n\ndef file_filter(f):\n if f[-4:] == '.sql' and 'schema' not in f and 'fkindexes' not in f:\n return True\n else:\n return False\n\n\ndef get_sql_from_file(file_name):\n file = open(file_name)\n lines = file.readlines().copy()\n sql = ''\n for line in lines:\n sql += line\n sql = sql.replace('\n', ' ').replace(' ', ' ').replace(' ', ' ')\n file.close()\n return sql\n\n\ndef test_hint_from_file(sql_file):\n db = Database()\n sql = get_sql_from_file(sql_file)\n success, result_cont = db.execute_sql(sql)\n print(success, result_cont)\n\n\ndef test_all():\n sql_files = all_sql_files()\n\n for sql_file in list(sql_files):\n if sql_file:\n test_hint_from_file(sql_file)\n\n\ndef test_one():\n res_path = \"{}/tpch-queries/\".format(\n os.path.dirname(os.path.abspath(__file__)))\n test_hint_from_file(res_path + '1.explain.sql')\n\n\nif __name__ == '__main__':\n for i in range(0, REPEATCOUNT):\n TIMELOG.write(str(int(time.time()))+\";\")\n test_all()\n TIMELOG.write(str(int(time.time()))+\"\n\")\n TIMELOG.flush()\n\n TIMELOG.close()\n",
    "description": "In an e-commerce platform's database, when trying to retrieve a large amount of data, such as inventory information for each product, the retrieval process uses correlated subqueries. 
If these subqueries are not optimized, executing them for a large number of products can result in poor query performance.\n", + "desc": "In an e-commerce platform's database, there is a table called 'ProductInventory' that stores inventory information for various products. This table contains data for tens of thousands or even hundreds of thousands of products, including their ID, current stock level, last update time, supplier ID, and warehouse location. When querying the inventory level of products, related subqueries are performed. For example, a common query might be to determine the total inventory of products in a specific category. This involves selecting products from the 'ProductDetails' table and conducting subqueries on the 'ProductInventory' table to obtain the inventory data. However, when dealing with a large number of products, these related subqueries can become time-consuming and inefficient. Reading a significant amount of data from disk may be required, leading to potential I/O bottlenecks.\n" + }, + "168": { + "start_time": "1697321338", + "end_time": "1697321410", + "start_timestamp": "2023-10-15 06:08:58", + "end_timestamp": "2023-10-15 06:10:10", + "alerts": [], + "labels": [ + "highly concurrent commits or highly concurrent inserts" + ], + "command": "python anomaly_trigger/main.py --anomaly INSERT_LARGE_DATA", + "script": "import psycopg2\nimport sys\nsys.path.append('/root/DB-GPT/')\nimport time\nimport datetime\nimport random\nimport yaml\nfrom multiprocessing.pool import *\n\nclass DBArgs(object):\n\n def __init__(self, dbtype, config, dbname=None):\n self.dbtype = dbtype\n if self.dbtype == 'mysql':\n self.host = config['host']\n self.port = config['port']\n self.user = config['user']\n self.password = config['password']\n self.dbname = dbname if dbname else config['dbname']\n self.driver = 'com.mysql.jdbc.Driver'\n self.jdbc = 'jdbc:mysql://'\n else:\n self.host = config['host']\n self.port = config['port']\n self.user = config['user']\n self.password = config['password']\n self.dbname = dbname if dbname else config['dbname']\n self.driver = 'org.postgresql.Driver'\n self.jdbc = 'jdbc:postgresql://'\n\nclass Database():\n def __init__(self, args, timeout=-1):\n self.args = args\n self.conn = self.resetConn(timeout)\n\n\n # self.schema = self.compute_table_schema()\n\n def resetConn(self, timeout=-1):\n conn = psycopg2.connect(database=self.args.dbname,\n user=self.args.user,\n password=self.args.password,\n host=self.args.host,\n port=self.args.port)\n return conn\n\n def execute_sqls(self,sql):\n self.conn =self.resetConn(timeout=-1)\n cur = self.conn.cursor()\n cur.execute(sql)\n self.conn.commit()\n cur.close()\n self.conn.close()\n\n def execute_sql_duration(self, duration, sql, max_id=0, commit_interval=500):\n self.conn = self.resetConn(timeout=-1)\n cursor = self.conn.cursor()\n start = time.time()\n cnt = 0\n if duration > 0:\n while (time.time() - start) < duration:\n if max_id > 0:\n id = random.randint(1, max_id - 1)\n cursor.execute(sql + str(id) + ';')\n else:\n cursor.execute(sql)\n cnt += 1\n if cnt % commit_interval == 0:\n self.conn.commit()\n else:\n print(\"error, the duration should be larger than 0\")\n self.conn.commit()\n cursor.close()\n self.conn.close()\n return cnt\n\n def concurrent_execute_sql(self, threads, duration, sql, max_id=0, commit_interval=500):\n pool = ThreadPool(threads)\n results = [pool.apply_async(self.execute_sql_duration, (duration, sql, max_id, commit_interval)) for _ in range(threads)]\n pool.close()\n 
pool.join()\n return results\n \ndef init():\n #add the config\n config_path = \"/root/DB-GPT/config/tool_config.yaml\"\n with open(config_path, 'r') as config_file:\n config = yaml.safe_load(config_file) \n db_args =DBArgs('pgsql', config)\n return db_args\n\n#create a table\ndef create_table(table_name,colsize, ncolumns):\n db=Database(init())\n column_definitions = ', '.join(f'name{i} varchar({colsize})' for i in range(ncolumns))\n creat_sql = f'CREATE TABLE {table_name} (id int, {column_definitions}, time timestamp);'\n db.execute_sqls(creat_sql)\n\n#delete the table\ndef delete_table(table_name):\n db=Database(init())\n delete_sql=f'DROP TABLE if exists {table_name}'\n db.execute_sqls(delete_sql)\n\n#print the current time\ndef print_time():\n current_time = datetime.datetime.now()\n formatted_time = current_time.strftime(\"%Y-%m-%d %H:%M:%S\")\n print(formatted_time)\n\n\ndef insert_large_data(threads,duration,ncolumns,nrows,colsize,table_name='table1'):\n\n print_time()\n #Delete undeleted tables\n delete_table(table_name)\n #create a new table\n create_table(table_name,colsize, ncolumns)\n db=Database(init())\n #insert the data\n #insert_definitions = ', '.join(f'repeat(round(random()*999)::text,{(colsize//3)})' for i in range(ncolumns))\n insert_definitions = ', '.join(f'(SELECT substr(md5(random()::text), 1, {colsize}))' for i in range(ncolumns))\n insert_data=f'insert into {table_name} select generate_series(1,{nrows}),{insert_definitions}, now();'\n db.concurrent_execute_sql(threads,duration,insert_data,commit_interval=1)\n\n #delete the table\n delete_table(table_name)\n \n #print the end time\n print_time()\nif __name__ == \"__main__\":\n # Number of threads to use for concurrent inserts\n num_threads = 195\n \n # Duration for which to run the inserts (in seconds)\n insert_duration = None\n \n # Number of columns in the table\n num_columns = 12\n \n # Number of rows to insert\n num_rows = 86\n \n # Size of each column (in characters)\n column_size = 33\n \n # Table name\n table_name = 'table1'\n \n # Call the insert_large_data function\n insert_large_data(num_threads, insert_duration, num_columns, num_rows, column_size, table_name)\n", + "description": "In a logging system where multiple devices constantly send logs, a large number of logs need to be simultaneously inserted into the database. The insertion process is simulated with 195 threads, inserting logs into a database table with 12 columns and 86 rows. Each column has a size of 33 characters. This can trigger a database exception due to the high volume of simultaneous insertions.\n", + "desc": "In a business intelligence scenario, suppose there is a database named 'BusinessIntelligenceDB', which is utilized for storing and analyzing various business data. This database consists of multiple tables, one of which is a key table named 'DataRecords', recording detailed information about different data sources. Specifically, this table contains 86 rows of data, each representing a data record, with a total of 12 columns, each column having a size of 33 characters. These columns might include data source ID, data type, data size, source location, data quality, date of data acquisition, data owner, and more.In this particular scenario, there is a need to perform an operation of inserting a large amount of data into the 'DataRecords' table. The database administrator decides to insert data for 195 different data sources, with each source having its own unique set of attributes. 
However, due to the lack of necessary optimization techniques or infrastructure limitations, the insertion of such a large amount of data can strain the resources and performance of the database. This can result in increased latency in the insertion process, potential conflicts with existing data, or even a complete failure in inserting all the desired data.Therefore, it is important for the database administrator to carefully plan and optimize the insertion process, taking into account factors such as the available system resources, table design, efficient data processing techniques, and load balancing across multiple threads. By implementing these optimizations, the database can handle the insertion of large data sets more efficiently and reduce the risk of anomalies or performance issues.\n" + }, + "169": { + "start_time": "1697321470", + "end_time": "1697321543", + "start_timestamp": "2023-10-15 06:11:10", + "end_timestamp": "2023-10-15 06:12:23", + "alerts": [], + "labels": [ + "highly concurrent commits or highly concurrent inserts" + ], + "command": "python anomaly_trigger/main.py --anomaly INSERT_LARGE_DATA", + "script": "import psycopg2\nimport sys\nsys.path.append('/root/DB-GPT/')\nimport time\nimport datetime\nimport random\nimport yaml\nfrom multiprocessing.pool import *\n\nclass DBArgs(object):\n\n def __init__(self, dbtype, config, dbname=None):\n self.dbtype = dbtype\n if self.dbtype == 'mysql':\n self.host = config['host']\n self.port = config['port']\n self.user = config['user']\n self.password = config['password']\n self.dbname = dbname if dbname else config['dbname']\n self.driver = 'com.mysql.jdbc.Driver'\n self.jdbc = 'jdbc:mysql://'\n else:\n self.host = config['host']\n self.port = config['port']\n self.user = config['user']\n self.password = config['password']\n self.dbname = dbname if dbname else config['dbname']\n self.driver = 'org.postgresql.Driver'\n self.jdbc = 'jdbc:postgresql://'\n\nclass Database():\n def __init__(self, args, timeout=-1):\n self.args = args\n self.conn = self.resetConn(timeout)\n\n\n # self.schema = self.compute_table_schema()\n\n def resetConn(self, timeout=-1):\n conn = psycopg2.connect(database=self.args.dbname,\n user=self.args.user,\n password=self.args.password,\n host=self.args.host,\n port=self.args.port)\n return conn\n\n def execute_sqls(self,sql):\n self.conn =self.resetConn(timeout=-1)\n cur = self.conn.cursor()\n cur.execute(sql)\n self.conn.commit()\n cur.close()\n self.conn.close()\n\n def execute_sql_duration(self, duration, sql, max_id=0, commit_interval=500):\n self.conn = self.resetConn(timeout=-1)\n cursor = self.conn.cursor()\n start = time.time()\n cnt = 0\n if duration > 0:\n while (time.time() - start) < duration:\n if max_id > 0:\n id = random.randint(1, max_id - 1)\n cursor.execute(sql + str(id) + ';')\n else:\n cursor.execute(sql)\n cnt += 1\n if cnt % commit_interval == 0:\n self.conn.commit()\n else:\n print(\"error, the duration should be larger than 0\")\n self.conn.commit()\n cursor.close()\n self.conn.close()\n return cnt\n\n def concurrent_execute_sql(self, threads, duration, sql, max_id=0, commit_interval=500):\n pool = ThreadPool(threads)\n results = [pool.apply_async(self.execute_sql_duration, (duration, sql, max_id, commit_interval)) for _ in range(threads)]\n pool.close()\n pool.join()\n return results\n \ndef init():\n #add the config\n config_path = \"/root/DB-GPT/config/tool_config.yaml\"\n with open(config_path, 'r') as config_file:\n config = yaml.safe_load(config_file) \n db_args =DBArgs('pgsql', 
config)\n return db_args\n\n#create a table\ndef create_table(table_name,colsize, ncolumns):\n db=Database(init())\n column_definitions = ', '.join(f'name{i} varchar({colsize})' for i in range(ncolumns))\n creat_sql = f'CREATE TABLE {table_name} (id int, {column_definitions}, time timestamp);'\n db.execute_sqls(creat_sql)\n\n#delete the table\ndef delete_table(table_name):\n db=Database(init())\n delete_sql=f'DROP TABLE if exists {table_name}'\n db.execute_sqls(delete_sql)\n\n#print the current time\ndef print_time():\n current_time = datetime.datetime.now()\n formatted_time = current_time.strftime(\"%Y-%m-%d %H:%M:%S\")\n print(formatted_time)\n\n\ndef insert_large_data(threads,duration,ncolumns,nrows,colsize,table_name='table1'):\n\n print_time()\n #Delete undeleted tables\n delete_table(table_name)\n #create a new table\n create_table(table_name,colsize, ncolumns)\n db=Database(init())\n #insert the data\n #insert_definitions = ', '.join(f'repeat(round(random()*999)::text,{(colsize//3)})' for i in range(ncolumns))\n insert_definitions = ', '.join(f'(SELECT substr(md5(random()::text), 1, {colsize}))' for i in range(ncolumns))\n insert_data=f'insert into {table_name} select generate_series(1,{nrows}),{insert_definitions}, now();'\n db.concurrent_execute_sql(threads,duration,insert_data,commit_interval=1)\n\n #delete the table\n delete_table(table_name)\n \n #print the end time\n print_time()\nif __name__ == \"__main__\":\n # Number of threads to use for concurrent inserts\n num_threads = 195\n \n # Duration for which to run the inserts (in seconds)\n insert_duration = None\n \n # Number of columns in the table\n num_columns = 33\n \n # Number of rows to insert\n num_rows = 71\n \n # Size of each column (in characters)\n column_size = 81\n \n # Table name\n table_name = 'table1'\n \n # Call the insert_large_data function\n insert_large_data(num_threads, insert_duration, num_columns, num_rows, column_size, table_name)\n", + "description": "In a data-intensive application, such as a financial system, a large amount of data generated by 195 sources needs to be inserted into the database simultaneously. This data insertion process involves 33 columns and 71 rows, with each column size set to 81 characters. The aim is to mimic the database exception that could occur due to the high volume of data being inserted at once.\n", + "desc": "In an Internet of Things (IoT) scenario, imagine a database named 'SmartHomeDB' designed for collecting and analyzing sensor data in smart homes. This database consists of a primary table called 'SensorReadings', which records various readings from sensors installed in a smart home, such as temperature, humidity, motion, light, and more. Each sensor reading is stored as a separate row in the table, with a total of 71 rows of data. The 'SensorReadings' table contains 33 columns, each representing different attributes of the sensor readings, such as sensor ID, reading type, reading value, timestamp, sensor location, and status information. These columns can store information of up to 81 characters. In this specific situation, 195 sensors installed in a smart home start transmitting data simultaneously at a high frequency. The database needs to handle these large volumes of data. However, due to various factors like insufficient buffering mechanisms, lack of proper indexing, or inadequacy in handling multiple write requests simultaneously, the database's performance may be impacted when processing these concurrent write operations efficiently. 
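For readers skimming these INSERT_LARGE_DATA entries: the embedded scripts all follow the same concurrent-insert pattern, condensed below as a sketch. Connection parameters are placeholders (the real scripts load them from /root/DB-GPT/config/tool_config.yaml), the duration is assumed to be a positive number of seconds, and table1 is assumed to already exist with the schema the scripts create (id int, name0..nameN varchar, time timestamp).

```python
# Condensed sketch of the INSERT_LARGE_DATA trigger used throughout this file:
# N worker threads each re-run an "INSERT ... SELECT generate_series(...)" statement
# for a fixed duration, committing after every statement (commit_interval=1).
import time
from multiprocessing.pool import ThreadPool

import psycopg2


def insert_worker(dsn, sql, duration):
    conn = psycopg2.connect(**dsn)  # dsn: dict with database/user/password/host/port
    cur = conn.cursor()
    start = time.time()
    while time.time() - start < duration:
        cur.execute(sql)
        conn.commit()
    cur.close()
    conn.close()


def trigger_insert_large_data(dsn, threads, duration, nrows, ncolumns, colsize):
    random_cols = ", ".join(
        f"(SELECT substr(md5(random()::text), 1, {colsize}))" for _ in range(ncolumns)
    )
    sql = f"INSERT INTO table1 SELECT generate_series(1, {nrows}), {random_cols}, now();"
    pool = ThreadPool(threads)
    for _ in range(threads):
        pool.apply_async(insert_worker, (dsn, sql, duration))
    pool.close()
    pool.join()
```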
This can result in delayed data recording or even database locking. These performance issues might lead to anomalies in the database and affect the overall functionality and efficiency of the smart home system.\n" + }, + "170": { + "start_time": "1697321603", + "end_time": "1697321663", + "start_timestamp": "2023-10-15 06:13:23", + "end_timestamp": "2023-10-15 06:14:23", + "alerts": [], + "labels": [ + "highly concurrent updates" + ], + "command": "python anomaly_trigger/main.py --anomaly LOCK_CONTENTION", + "script": "import psycopg2\nimport sys\nsys.path.append('/root/DB-GPT/')\nimport time\nimport datetime\nimport random\nimport yaml\nfrom multiprocessing.pool import *\n\n\nclass DBArgs(object):\n\n def __init__(self, dbtype, config, dbname=None):\n self.dbtype = dbtype\n if self.dbtype == 'mysql':\n self.host = config['host']\n self.port = config['port']\n self.user = config['user']\n self.password = config['password']\n self.dbname = dbname if dbname else config['dbname']\n self.driver = 'com.mysql.jdbc.Driver'\n self.jdbc = 'jdbc:mysql://'\n else:\n self.host = config['host']\n self.port = config['port']\n self.user = config['user']\n self.password = config['password']\n self.dbname = dbname if dbname else config['dbname']\n self.driver = 'org.postgresql.Driver'\n self.jdbc = 'jdbc:postgresql://'\n\nclass Database():\n def __init__(self, args, timeout=-1):\n self.args = args\n self.conn = self.resetConn(timeout)\n\n\n # self.schema = self.compute_table_schema()\n\n def resetConn(self, timeout=-1):\n conn = psycopg2.connect(database=self.args.dbname,\n user=self.args.user,\n password=self.args.password,\n host=self.args.host,\n port=self.args.port)\n return conn\n \n def execute_sqls(self,sql):\n self.conn =self.resetConn(timeout=-1)\n cur = self.conn.cursor()\n cur.execute(sql)\n self.conn.commit()\n cur.close()\n self.conn.close()\n\n def execute_sql_duration(self, duration, sql, max_id=0, commit_interval=500):\n self.conn = self.resetConn(timeout=-1)\n cursor = self.conn.cursor()\n start = time.time()\n cnt = 0\n if duration > 0:\n while (time.time() - start) < duration:\n if max_id > 0:\n id = random.randint(1, max_id - 1)\n cursor.execute(sql + str(id) + ';')\n else:\n cursor.execute(sql)\n cnt += 1\n if cnt % commit_interval == 0:\n self.conn.commit()\n else:\n print(\"error, the duration should be larger than 0\")\n self.conn.commit()\n cursor.close()\n self.conn.close()\n return cnt\n\n def concurrent_execute_sql(self, threads, duration, sql, max_id=0, commit_interval=500):\n pool = ThreadPool(threads)\n results = [pool.apply_async(self.execute_sql_duration, (duration, sql, max_id, commit_interval)) for _ in range(threads)]\n pool.close()\n pool.join()\n return results\n\ndef init():\n #add the config\n config_path = \"/root/DB-GPT/config/tool_config.yaml\"\n with open(config_path, 'r') as config_file:\n config = yaml.safe_load(config_file) \n db_args =DBArgs('pgsql', config)\n return db_args\n\n\n#create a table\ndef create_table(table_name,colsize, ncolumns):\n db=Database(init())\n column_definitions = ', '.join(f'name{i} varchar({colsize})' for i in range(ncolumns))\n creat_sql = f'CREATE TABLE {table_name} (id int, {column_definitions}, time timestamp);'\n db.execute_sqls(creat_sql)\n\n#delete the table\ndef delete_table(table_name):\n db=Database(init())\n delete_sql=f'DROP TABLE if exists {table_name}'\n db.execute_sqls(delete_sql)\n\n#print the current time\ndef print_time():\n current_time = datetime.datetime.now()\n formatted_time = current_time.strftime(\"%Y-%m-%d 
%H:%M:%S\")\n print(formatted_time)\n\n\ndef lock_contention(threads,duration,ncolumns,nrows,colsize,table_name='table1'):\n #create a new table\n print_time()\n delete_table(table_name)\n create_table(table_name,colsize, ncolumns)\n db=Database(init())\n # insert some data to be updated \n insert_definitions = ', '.join(f'(SELECT substr(md5(random()::text), 1, {colsize}))' for i in range(ncolumns))\n insert_data=f'insert into {table_name} select generate_series(1,{nrows}),{insert_definitions}, now();' \n db.execute_sqls(insert_data) \n pool = Pool(threads)\n for _ in range(threads):\n pool.apply_async(\n lock, (table_name, ncolumns, colsize, duration, nrows))\n pool.close()\n pool.join()\n #delete the table\n delete_table(table_name)\n print_time()\n\ndef lock(table_name, ncolumns, colsize, duration, nrows):\n args=init()\n start = time.time()\n #lock_contention\n while time.time()-start < duration:\n conn = psycopg2.connect(database=args.dbname, user=args.user, password=args.password,\n host=args.host, port=args.port)\n cur = conn.cursor()\n while time.time()-start < duration:\n col_name = random.randint(0, ncolumns-1)\n row_name = random.randint(1, nrows-1)\n lock_contention = f'update {table_name} set name{col_name}=(SELECT substr(md5(random()::text), 1, {colsize})) where id ={row_name}'\n #db.concurrent_execute_sql(threads,duration,lock_contention,nrows)\n cur.execute(lock_contention)\n conn.commit()\n conn.commit()\n conn.close()\n\nif __name__ == \"__main__\":\n # Number of threads to use for concurrent inserts\n num_threads = 157\n \n # Duration for which to run the inserts (in seconds)\n insert_duration = None\n \n # Number of columns in the table\n num_columns = 98\n \n # Number of rows to insert\n num_rows = 376\n \n # Size of each column (in characters)\n column_size = 67\n \n # Table name\n table_name = 'table1'\n \n # Call the insert_large_data function\n lock_contention(num_threads, insert_duration, num_columns, num_rows, column_size, table_name)\n", + "description": "In a database system used by an online store, there is a scenario where 157 users simultaneously attempt to perform frequent update operations. The database table they are updating has 98 columns and 376 rows of product records, each with a column size of 67 characters. These users compete with each other to lock the database table for updates, simulating a scenario of lock contention.\n", + "desc": "In a database scenario related to file sharing, imagine a database system named 'FileShareDB', which is designed for teams or organizations to share files. This database not only stores the actual files but also maintains the metadata associated with each file, such as uploader information, file size, creation and modification dates, version history, access permissions, and download counts. In this particular situation, the 'FileShareDB' database is facing an anomaly related to lock contention, which occurs when multiple users are simultaneously accessing or modifying the same or nearby rows of data. This contention arises when 157 users attempt to perform various file operations, such as uploading, downloading, or modifying files, on the 'FileShareDB' database at the same time. Due to the database's locking mechanism, these concurrent operations can lead to conflicts and delays in accessing or modifying the data. 
If such contention persists for a long duration, it can significantly impact the performance and responsiveness of the database, potentially causing delays, failures, or even data inconsistencies in file sharing operations.\n" + }, + "171": { + "start_time": "1697321723", + "end_time": "1697321805", + "start_timestamp": "2023-10-15 06:15:23", + "end_timestamp": "2023-10-15 06:16:45", + "alerts": [], + "labels": [ + "highly deletes" + ], + "command": "python anomaly_trigger/main.py --anomaly VACUUM", + "script": "import psycopg2\nimport sys\nsys.path.append('/root/DB-GPT/')\nimport time\nimport datetime\nimport random\nimport yaml\nfrom multiprocessing.pool import *\n\n\nclass DBArgs(object):\n\n def __init__(self, dbtype, config, dbname=None):\n self.dbtype = dbtype\n if self.dbtype == 'mysql':\n self.host = config['host']\n self.port = config['port']\n self.user = config['user']\n self.password = config['password']\n self.dbname = dbname if dbname else config['dbname']\n self.driver = 'com.mysql.jdbc.Driver'\n self.jdbc = 'jdbc:mysql://'\n else:\n self.host = config['host']\n self.port = config['port']\n self.user = config['user']\n self.password = config['password']\n self.dbname = dbname if dbname else config['dbname']\n self.driver = 'org.postgresql.Driver'\n self.jdbc = 'jdbc:postgresql://'\n\nclass Database():\n def __init__(self, args, timeout=-1):\n self.args = args\n self.conn = self.resetConn(timeout)\n\n\n # self.schema = self.compute_table_schema()\n\n def resetConn(self, timeout=-1):\n conn = psycopg2.connect(database=self.args.dbname,\n user=self.args.user,\n password=self.args.password,\n host=self.args.host,\n port=self.args.port)\n return conn\n \n def execute_sqls(self,sql):\n self.conn =self.resetConn(timeout=-1)\n cur = self.conn.cursor()\n cur.execute(sql)\n self.conn.commit()\n cur.close()\n self.conn.close()\n\n def execute_sql_duration(self, duration, sql, max_id=0, commit_interval=500):\n self.conn = self.resetConn(timeout=-1)\n cursor = self.conn.cursor()\n start = time.time()\n cnt = 0\n if duration > 0:\n while (time.time() - start) < duration:\n if max_id > 0:\n id = random.randint(1, max_id - 1)\n cursor.execute(sql + str(id) + ';')\n else:\n cursor.execute(sql)\n cnt += 1\n if cnt % commit_interval == 0:\n self.conn.commit()\n else:\n print(\"error, the duration should be larger than 0\")\n self.conn.commit()\n cursor.close()\n self.conn.close()\n return cnt\n\n def concurrent_execute_sql(self, threads, duration, sql, max_id=0, commit_interval=500):\n pool = ThreadPool(threads)\n results = [pool.apply_async(self.execute_sql_duration, (duration, sql, max_id, commit_interval)) for _ in range(threads)]\n pool.close()\n pool.join()\n return results\n \ndef init():\n #add the config\n config_path = \"/root/DB-GPT/config/tool_config.yaml\"\n with open(config_path, 'r') as config_file:\n config = yaml.safe_load(config_file) \n db_args =DBArgs('pgsql', config)\n return db_args\n\n\n#create a table\ndef create_table(table_name,colsize, ncolumns):\n db=Database(init())\n column_definitions = ', '.join(f'name{i} varchar({colsize})' for i in range(ncolumns))\n creat_sql = f'CREATE TABLE {table_name} (id int, {column_definitions}, time timestamp);'\n db.execute_sqls(creat_sql)\n\n#delete the table\ndef delete_table(table_name):\n db=Database(init())\n delete_sql=f'DROP TABLE if exists {table_name}'\n db.execute_sqls(delete_sql)\n\n#print the current time\ndef print_time():\n current_time = datetime.datetime.now()\n formatted_time = current_time.strftime(\"%Y-%m-%d 
%H:%M:%S\")\n print(formatted_time)\n\n\ndef vacuum(threads,duration,ncolumns,nrows,colsize,table_name='table1'):\n db=Database(init())\n #create a new table\n print_time()\n delete_table(table_name)\n create_table(table_name,colsize, ncolumns)\n\n # insert some data to be deleted\n insert_definitions = ', '.join(f'(SELECT substr(md5(random()::text), 1, {colsize}))' for i in range(ncolumns))\n insert_data=f'insert into {table_name} select generate_series(1,{nrows}),{insert_definitions}, now();' \n db.execute_sqls(insert_data) \n\n # delete 80% of the rows\n delete_nrows=int(nrows*0.8)\n vacuum=f'delete from {table_name} where id < {delete_nrows};'\n db.execute_sqls(vacuum)\n\n # do the select , then the vacuum occurs\n select='select * from '+table_name+' where id='\n db.concurrent_execute_sql(threads,duration,select,nrows)\n\n #delete the table\n delete_table(table_name)\n print_time()\n\n\nif __name__ == \"__main__\":\n # Number of threads to use for concurrent inserts\n num_threads = 88\n \n # Duration for which to run the inserts (in seconds)\n insert_duration = None\n \n # Number of columns in the table\n num_columns = 12\n \n # Number of rows to insert\n num_rows = 2664318\n \n # Size of each column (in characters)\n column_size = 81\n \n # Table name\n table_name = 'table1'\n \n # Call the insert_large_data function\n vacuum(num_threads, insert_duration, num_columns, num_rows, column_size, table_name)\n", + "description": "In an e-commerce platform's database, 88 users simultaneously perform a search operation after a large-scale data cleaning process on a database table containing 12 columns, 2,664,318 rows, with each column having a size of 81 characters for product records. The aim is to simulate the exception that occurs during this process.\n", + "desc": "In an e-commerce database called 'OnlineStoreDB', there is a table known as 'ProductRecords' that stores detailed information about various products. This table contains a total of 2,664,318 rows, with each row representing a specific product entry. The table consists of 12 columns, each capable of holding up to 81 characters. These columns include information such as product ID, name, price, stock quantity, description, brand, category, size, color, weight, production date, and expiration date. In this scenario, the database administrator needs to perform a VACUUM operation, which is a process of reclaiming storage space and improving the performance of the database. The VACUUM operation involves removing unnecessary data or reclaiming space from deleted or updated rows. Since the 'ProductRecords' table is quite large and contains a significant number of rows, the VACUUM operation might take a considerable amount of time and system resources to complete.If the VACUUM operation is not properly optimized or if it is executed during peak traffic periods, it could potentially lead to performance issues in the database. These performance issues might include increased response times, slower queries, or even temporary interruptions in the availability of the database. 
To minimize the impact of the VACUUM operation on the database and ensure smooth operation, it is recommended to perform the operation during low-traffic periods or to optimize the operation by using batch processing or incremental deletion techniques.\n" + }, + "172": { + "start_time": "1697321865", + "end_time": "1697321978", + "start_timestamp": "2023-10-15 06:17:45", + "end_timestamp": "2023-10-15 06:19:38", + "alerts": [], + "labels": [ + "too many indexes" + ], + "command": "python anomaly_trigger/main.py --anomaly REDUNDANT_INDEX", + "script": "import psycopg2\nimport sys\nsys.path.append('/root/DB-GPT/')\nimport time\nimport datetime\nimport random\nimport yaml\nfrom multiprocessing.pool import *\n\n\nclass DBArgs(object):\n\n def __init__(self, dbtype, config, dbname=None):\n self.dbtype = dbtype\n if self.dbtype == 'mysql':\n self.host = config['host']\n self.port = config['port']\n self.user = config['user']\n self.password = config['password']\n self.dbname = dbname if dbname else config['dbname']\n self.driver = 'com.mysql.jdbc.Driver'\n self.jdbc = 'jdbc:mysql://'\n else:\n self.host = config['host']\n self.port = config['port']\n self.user = config['user']\n self.password = config['password']\n self.dbname = dbname if dbname else config['dbname']\n self.driver = 'org.postgresql.Driver'\n self.jdbc = 'jdbc:postgresql://'\n\nclass Database():\n def __init__(self, args, timeout=-1):\n self.args = args\n self.conn = self.resetConn(timeout)\n\n\n # self.schema = self.compute_table_schema()\n\n def resetConn(self, timeout=-1):\n conn = psycopg2.connect(database=self.args.dbname,\n user=self.args.user,\n password=self.args.password,\n host=self.args.host,\n port=self.args.port)\n return conn\n \n def execute_sqls(self,sql):\n self.conn =self.resetConn(timeout=-1)\n cur = self.conn.cursor()\n cur.execute(sql)\n self.conn.commit()\n cur.close()\n self.conn.close()\n\n def execute_sql_duration(self, duration, sql, max_id=0, commit_interval=500):\n self.conn = self.resetConn(timeout=-1)\n cursor = self.conn.cursor()\n start = time.time()\n cnt = 0\n if duration > 0:\n while (time.time() - start) < duration:\n if max_id > 0:\n id = random.randint(1, max_id - 1)\n cursor.execute(sql + str(id) + ';')\n else:\n cursor.execute(sql)\n cnt += 1\n if cnt % commit_interval == 0:\n self.conn.commit()\n else:\n print(\"error, the duration should be larger than 0\")\n self.conn.commit()\n cursor.close()\n self.conn.close()\n return cnt\n\n def concurrent_execute_sql(self, threads, duration, sql, max_id=0, commit_interval=500):\n pool = ThreadPool(threads)\n results = [pool.apply_async(self.execute_sql_duration, (duration, sql, max_id, commit_interval)) for _ in range(threads)]\n pool.close()\n pool.join()\n return results\n \n def build_index(self, table_name, idx_num):\n self.conn = self.resetConn(timeout=-1)\n cursor = self.conn.cursor()\n \n for i in range(0, idx_num):\n the_sql = 'CREATE INDEX index_' + table_name + '_' + str(i) + ' ON ' + table_name + '(name' + str(i) + ');'\n print(the_sql)\n cursor.execute(the_sql)\n\n \n self.conn.commit()\n self.conn.close()\n return\n\n\n \n def drop_index(self,table_name):\n self.conn = self.resetConn(timeout=-1)\n cursor = self.conn.cursor()\n cursor.execute(\"select indexname from pg_indexes where tablename='\"+table_name+\"';\")\n idxs = cursor.fetchall()\n for idx in idxs:\n the_sql = 'DROP INDEX ' + idx[0] + ';'\n cursor.execute(the_sql)\n print(the_sql)\n self.conn.commit()\n self.conn.close()\n return\n\n\ndef init():\n #add the config\n 
config_path = \"/root/DB-GPT/config/tool_config.yaml\"\n with open(config_path, 'r') as config_file:\n config = yaml.safe_load(config_file) \n db_args =DBArgs('pgsql', config)\n return db_args\n\n\n#create a table\ndef create_table(table_name,colsize, ncolumns):\n db=Database(init())\n column_definitions = ', '.join(f'name{i} varchar({colsize})' for i in range(ncolumns))\n creat_sql = f'CREATE TABLE {table_name} (id int, {column_definitions}, time timestamp);'\n db.execute_sqls(creat_sql)\n\n#delete the table\ndef delete_table(table_name):\n db=Database(init())\n delete_sql=f'DROP TABLE if exists {table_name}'\n db.execute_sqls(delete_sql)\n\n#print the current time\ndef print_time():\n current_time = datetime.datetime.now()\n formatted_time = current_time.strftime(\"%Y-%m-%d %H:%M:%S\")\n print(formatted_time)\n\ndef redundent_index(threads,duration,ncolumns,nrows,colsize,nindex,table_name='table1'):\n #create a new table\n print_time()\n delete_table(table_name)\n create_table(table_name,colsize, ncolumns)\n db=Database(init())\n # insert some data to be updated \n insert_definitions = ', '.join(f'(SELECT substr(md5(random()::text), 1, {colsize}))' for i in range(ncolumns))\n insert_data=f'insert into {table_name} select generate_series(1,{nrows}),{insert_definitions}, now();' \n db.execute_sqls(insert_data) \n\n #initialization of the indexes\n nindex=int((nindex*ncolumns)/10)\n db.build_index(table_name,nindex)\n id_index='CREATE INDEX index_'+table_name+'_id ON '+table_name+'(id);'\n db.execute_sqls(id_index)\n\n #lock_contention\n pool = Pool(threads)\n for _ in range(threads):\n pool.apply_async(\n lock, (table_name, ncolumns, colsize, duration, nrows))\n pool.close()\n pool.join()\n\n #drop the index\n db.drop_index(table_name)\n\n #delete the table\n delete_table(table_name)\n print_time()\n\n\ndef lock(table_name, ncolumns, colsize, duration, nrows):\n args=init()\n start = time.time()\n #lock_contention\n while time.time()-start < duration:\n conn = psycopg2.connect(database=args.dbname, user=args.user, password=args.password,\n host=args.host, port=args.port)\n cur = conn.cursor()\n while time.time()-start < duration:\n col_name = random.randint(0, ncolumns-1)\n row_name = random.randint(1, nrows-1)\n lock_contention = f'update {table_name} set name{col_name}=(SELECT substr(md5(random()::text), 1, {colsize})) where id ={row_name}'\n #db.concurrent_execute_sql(threads,duration,lock_contention,nrows)\n cur.execute(lock_contention)\n conn.commit()\n conn.commit()\n conn.close()\n\nif __name__ == \"__main__\":\n # Number of threads to use for concurrent inserts\n num_threads = 6\n \n # Duration for which to run the inserts (in seconds)\n insert_duration = None\n \n # Number of columns in the table\n num_columns = 79\n \n # Number of rows to insert\n num_rows = 990412\n \n # Size of each column (in characters)\n column_size = 58\n \n # Table name\n table_name = 'table1'\n \n nindex=6\n \n # Call the insert_large_data function\n redundent_index(num_threads, insert_duration, num_columns, num_rows, column_size, nindex,table_name)\n", + "description": "In an online platform database, a search operation is performed by 6 users simultaneously on a table with 79 columns and 990,412 rows, with each column containing 58 characters. This dataset has a large number of redundant indexes, which can result in additional storage space and performance overhead.\n", + "desc": "In an e-commerce platform database, there is a table called 'ProductCatalog' that stores information about various products. 
This table contains almost one million rows of data, with each row representing a unique product entry. The table consists of 79 columns, each column able to store information up to 58 characters long. These columns include product ID, name, price, stock quantity, description, brand, category, size, color, weight, production date, expiration date, supplier information, country, rating, number of reviews, sales status, promotional information, image link, and other relevant attributes. However, due to the need to improve query performance for complex business intelligence analysis tasks, the database administrators decide to create multiple redundant indexes before executing these queries and then delete them once the queries are completed. This frequent creation and deletion of indexes can lead to additional storage usage and performance overhead in the database. Moreover, frequent index operations might cause database fragmentation and impact performance. In an e-commerce platform, this could result in delayed report generation, affecting the efficiency of the decision-making process.\n" + }, + "173": { + "start_time": "1697322038", + "end_time": "1697322129", + "start_timestamp": "2023-10-15 06:20:38", + "end_timestamp": "2023-10-15 06:22:09", + "alerts": [], + "labels": [ + "INSERT_LARGE_DATA", + "IO_CONTENTION" + ], + "command": "python anomaly_trigger/main.py --anomaly INSERT_LARGE_DATA,IO_CONTENTION", + "script": "import os\nimport datetime\n\n#print the current time\ndef print_time():\n current_time = datetime.datetime.now()\n formatted_time = current_time.strftime(\"%Y-%m-%d %H:%M:%S\")\n print(formatted_time)\n\nif __name__ == \"__main__\":\n print_time()\n command = (\n \"su - root -c 'cd /sysbench-tpcc-master; \"\n \"./tpcc.lua --db-driver=pgsql --tables=2 --scale=3 --threads=50 --events=0 \"\n \"--pgsql-host=xxxx --pgsql-user=xxxx --pgsql-password=xxxx \"\n \"--pgsql-port=5432 --pgsql-db=tpcc --time=90 --rand-type=uniform --report-interval=10 run'\"\n )\n\n os.system(command)\n print_time()\n", + "description": "In a system where multiple users are sharing files, there is a high amount of I/O contention. This occurs when multiple users are simultaneously uploading, downloading, or editing files, causing the file transfer process to slow down.\n", + "desc": "In an IoT scenario, we can imagine a database called 'SensorDataDB', which is used to collect and analyze data from various sensors. This database is designed to handle a large volume of data, including sensor readings such as temperature, humidity, pressure, light, and motion. When the sensors start transmitting data simultaneously at a high frequency, the database may experience performance issues. Due to insufficient buffering mechanisms or improper indexing, the database's ability to handle these concurrent write requests is limited. 
This can result in increased write latency and even database locking, leading to anomalies.\n" + }, + "174": { + "start_time": "1697322189", + "end_time": "1697322249", + "start_timestamp": "2023-10-15 06:23:09", + "end_timestamp": "2023-10-15 06:24:09", + "alerts": [], + "labels": [ + "POOR JOIN PERFORMANCE", + "CPU CONTENTION" + ], + "command": "python anomaly_trigger/main.py --anomaly POOR_JOIN_PERFORMANCE,CPU_CONTENTION", + "script": "import os\nimport re\nimport time\n\nimport psycopg2\n\n\nREPEATCOUNT = 1\nTIMELOGPATH = str(int(time.time())) + \"_job_trigger_time_log.txt\"\nTIMELOG = open(TIMELOGPATH, 'w+')\n\n\nclass Database():\n\n def __init__(self):\n self.conn = None\n self.conn = psycopg2.connect(database='imdbload',\n user='xxxx',\n password='xxxx',\n host='xxxx',\n port=5432)\n\n def execute_sql(self, sql):\n fail = 1\n cur = self.conn.cursor()\n i = 0\n cnt = 3\n while fail == 1 and i < cnt:\n try:\n fail = 0\n cur.execute(sql)\n except BaseException as error:\n fail = 1\n print(error)\n res = []\n if fail == 0:\n res = cur.fetchall()\n i = i + 1\n if fail == 1:\n # print(\"SQL Execution Fatal!!\", sql)\n return 0, ''\n elif fail == 0:\n return 1, res\n\n\ndef all_sql_files():\n res_path = \"{}/join-order-benchmark-master/\".format(\n os.path.dirname(os.path.abspath(__file__)))\n # all_file_list = list(filter(file_filter, os.listdir(res_path)))\n # all_file_list = sorted(all_file_list, key=custom_sort)\n all_file_list = [\n '1a.sql', '1b.sql', '1c.sql', '1d.sql',\n '2a.sql', '2b.sql', '2c.sql', '2d.sql',\n '3a.sql', '3b.sql', '3c.sql',\n '4a.sql', '4b.sql', '4c.sql',\n '5a.sql', '5b.sql', '5c.sql',\n '6a.sql', '6b.sql', '6c.sql', '6d.sql', '6e.sql', '6f.sql',\n '7a.sql', '7b.sql', '7c.sql',\n '8a.sql', '8b.sql', '8c.sql', '8d.sql',\n '9a.sql', '9b.sql', '9c.sql', '9d.sql',\n '10a.sql', '10b.sql', '10c.sql',\n '11a.sql', '11b.sql', '11c.sql', '11d.sql',\n '12a.sql', '12b.sql', '12c.sql',\n '13a.sql', '13b.sql', '13c.sql', '13d.sql',\n '14a.sql', '14b.sql', '14c.sql',\n '15a.sql', '15b.sql', '15c.sql', '15d.sql',\n '16a.sql', '16b.sql', '16c.sql', '16d.sql',\n '17a.sql', '17b.sql', '17c.sql', '17d.sql', '17e.sql', '17f.sql',\n '18a.sql', '18b.sql', '18c.sql',\n '19a.sql', '19b.sql', '19c.sql', '19d.sql',\n '20a.sql', '20b.sql', '20c.sql',\n '21a.sql', '21b.sql', '21c.sql',\n '22a.sql', '22b.sql', '22c.sql', '22d.sql',\n '23a.sql', '23b.sql', '23c.sql',\n '24a.sql', '24b.sql',\n '25a.sql', '25b.sql', '25c.sql',\n '26a.sql', '26b.sql', '26c.sql',\n '27a.sql', '27b.sql', '27c.sql',\n '28a.sql', '28b.sql', '28c.sql',\n '29a.sql', '29b.sql', '29c.sql',\n '30a.sql', '30b.sql', '30c.sql',\n '31a.sql', '31b.sql', '31c.sql',\n '32a.sql', '32b.sql',\n '33a.sql', '33b.sql', '33c.sql']\n\n print(all_file_list)\n files_list = []\n for file in all_file_list:\n files_list.append(res_path + file)\n return files_list\n\n\ndef custom_sort(item):\n # \u63d0\u53d6\u6570\u5b57\u548c\u5b57\u6bcd\u90e8\u5206\n match = re.match(r'(\\d+)(\\D+)', item)\n # \u5c06\u6570\u5b57\u90e8\u5206\u8f6c\u6362\u4e3a\u6574\u6570\u4ee5\u8fdb\u884c\u6bd4\u8f83\n num_part = int(match.group(1))\n # \u8fd4\u56de\u5143\u7ec4\u4ee5\u6309\u6570\u5b57\u548c\u5b57\u6bcd\u6392\u5e8f\n return (num_part, match.group(2))\n\n\ndef file_filter(f):\n if f[-4:] == '.sql' and 'schema' not in f and 'fkindexes' not in f:\n return True\n else:\n return False\n\n\ndef get_sql_from_file(file_name):\n file = open(file_name)\n lines = file.readlines().copy()\n sql = ''\n for line in lines:\n sql += line\n sql = sql.replace('\n', ' 
').replace(' ', ' ').replace(' ', ' ')\n file.close()\n return sql\n\n\ndef test_hint_from_file(sql_file):\n db = Database()\n sql = get_sql_from_file(sql_file)\n success, result_cont = db.execute_sql(sql)\n print(success, result_cont)\n\n\ndef test_all():\n sql_files = all_sql_files()\n\n for sql_file in list(sql_files)[:-10]:\n if sql_file:\n test_hint_from_file(sql_file)\n\n\ndef test_one():\n res_path = \"{}/join-order-benchmark-master/\".format(\n os.path.dirname(os.path.abspath(__file__)))\n test_hint_from_file(res_path + '1a.sql')\n\n\nif __name__ == '__main__':\n for i in range(0, REPEATCOUNT):\n TIMELOG.write(str(int(time.time()))+\";\")\n test_all()\n TIMELOG.write(str(int(time.time()))+\"\n\")\n TIMELOG.flush()\n\n TIMELOG.close()\n\n)\n", + "description": "In a large database containing multiple tables, there is a poor performance issue with joining tables due to a lack of optimization. This is compounded by high CPU contention, where multiple users are competing for CPU resources, resulting in slower data processing and query response times.\n", + "desc": "In a business intelligence scenario, there is a database named 'CorporateAnalyticsDB' that stores various business data of a large corporation. This database contains multiple complex tables for analyzing business information. In this particular scenario, analysts frequently need to perform complex join queries across multiple tables to generate comprehensive business reports. However, due to improper indexing or optimization, these join queries can be very slow and consume a significant amount of time and resources. During peak periods, when multiple join queries are executed simultaneously, there can be competition for CPU resources, further reducing the efficiency of the queries. This CPU contention can occur when there are too many compute-intensive queries running on the database server or when the server's CPU resources are insufficient.\n" + }, + "175": { + "start_time": "1697322309", + "end_time": "1697322457", + "start_timestamp": "2023-10-15 06:25:09", + "end_timestamp": "2023-10-15 06:27:37", + "alerts": [], + "labels": [ + "FETCH_LARGE_DATA", + "CORRELATED SUBQUERY" + ], + "command": "python anomaly_trigger/main.py --anomaly FETCH_LARGE_DATA,CORRELATED_SUBQUERY", + "script": "import os\nimport re\nimport time\n\nimport psycopg2\n\n\nREPEATCOUNT = 1\nTIMELOGPATH = str(int(time.time())) + \"_tpch_trigger_time_log.txt\"\nTIMELOG = open(TIMELOGPATH, 'w+')\n\n\nclass Database():\n\n def __init__(self):\n self.conn = None\n self.conn = psycopg2.connect(database='tpch',\n user='xxxx',\n password='xxxx',\n host='xxxx',\n port=5432)\n\n def execute_sql(self, sql):\n fail = 1\n cur = self.conn.cursor()\n i = 0\n cnt = 3\n while fail == 1 and i < cnt:\n try:\n fail = 0\n cur.execute(sql)\n except BaseException as error:\n fail = 1\n print(error)\n res = []\n if fail == 0:\n res = cur.fetchall()\n i = i + 1\n if fail == 1:\n # print(\"SQL Execution Fatal!!\", sql)\n return 0, ''\n elif fail == 0:\n return 1, res\n\n\ndef all_sql_files():\n res_path = \"{}/tpch-queries/\".format(\n os.path.dirname(os.path.abspath(__file__)))\n # all_file_list = list(filter(file_filter, os.listdir(res_path)))\n # all_file_list = sorted(all_file_list, key=custom_sort)\n all_file_list = [\n '4.explain.sql']\n\n print(all_file_list)\n files_list = []\n for file in all_file_list:\n files_list.append(res_path + file)\n return files_list\n\n\ndef custom_sort(item):\n # \u63d0\u53d6\u6570\u5b57\u548c\u5b57\u6bcd\u90e8\u5206\n match = 
re.match(r'(\\d+)(\\D+)', item)\n # \u5c06\u6570\u5b57\u90e8\u5206\u8f6c\u6362\u4e3a\u6574\u6570\u4ee5\u8fdb\u884c\u6bd4\u8f83\n num_part = int(match.group(1))\n # \u8fd4\u56de\u5143\u7ec4\u4ee5\u6309\u6570\u5b57\u548c\u5b57\u6bcd\u6392\u5e8f\n return (num_part, match.group(2))\n\n\ndef file_filter(f):\n if f[-4:] == '.sql' and 'schema' not in f and 'fkindexes' not in f:\n return True\n else:\n return False\n\n\ndef get_sql_from_file(file_name):\n file = open(file_name)\n lines = file.readlines().copy()\n sql = ''\n for line in lines:\n sql += line\n sql = sql.replace('\n', ' ').replace(' ', ' ').replace(' ', ' ')\n file.close()\n return sql\n\n\ndef test_hint_from_file(sql_file):\n db = Database()\n sql = get_sql_from_file(sql_file)\n success, result_cont = db.execute_sql(sql)\n print(success, result_cont)\n\n\ndef test_all():\n sql_files = all_sql_files()\n\n for sql_file in list(sql_files):\n if sql_file:\n test_hint_from_file(sql_file)\n\n\ndef test_one():\n res_path = \"{}/tpch-queries/\".format(\n os.path.dirname(os.path.abspath(__file__)))\n test_hint_from_file(res_path + '1.explain.sql')\n\n\nif __name__ == '__main__':\n for i in range(0, REPEATCOUNT):\n TIMELOG.write(str(int(time.time()))+\";\")\n test_all()\n TIMELOG.write(str(int(time.time()))+\"\n\")\n TIMELOG.flush()\n\n TIMELOG.close()\n", + "description": "In an online retail business, there is a need to fetch a large amount of data from the database, specifically related to inventory information. The query involves executing subqueries that are interdependent. If these subqueries are not optimized, the performance of retrieving inventory data may be negatively affected.\n", + "desc": "In the database of an e-commerce platform, suppose there is a database named 'ECommerceDB', which includes a crucial table named 'ProductInventory' for recording the inventory information of various products. This table might contain inventory data for tens of thousands or even hundreds of thousands of products. The inventory information for each product includes the product ID, current stock level, last inventory update time, supplier ID, warehouse location, and other details. In this database, querying the inventory level of each product may require performing related subqueries. For example, a common query might be to determine the total current inventory of all products within a specific category. This type of query might first involve selecting all products of a particular category from the 'ProductDetails' table, then conducting subqueries on the 'ProductInventory' table to obtain the inventory data for these products. When the number of products is very large, the performance of these related subqueries can become inefficient. For instance, if a category includes thousands of products, executing individual subqueries for each product to obtain inventory information would be time-consuming. 
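The per-product lookup this scenario describes can be written either as a correlated subquery or as a single join; a sketch of both forms, using the scenario's hypothetical 'ProductDetails' and 'ProductInventory' tables and invented column names (the dataset defines no real schema for them):

```python
# Correlated form: the inner query is conceptually re-evaluated for every product row.
correlated_query = """
SELECT d.product_id,
       (SELECT i.stock_level
          FROM ProductInventory i
         WHERE i.product_id = d.product_id) AS stock_level
  FROM ProductDetails d
 WHERE d.category = 'electronics';
"""

# Join form: the same result when inventory has one row per product, usually cheaper
# because the planner can hash- or merge-join the two tables in a single pass.
join_query = """
SELECT d.product_id, i.stock_level
  FROM ProductDetails d
  JOIN ProductInventory i ON i.product_id = d.product_id
 WHERE d.category = 'electronics';
"""
```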
In such cases, due to the need to retrieve inventory information for a large number of products, the database might need to read a significant amount of data from the disk, which could lead to I/O bottlenecks.\n" + }, + "176": { + "start_time": "1697322518", + "end_time": "1697322590", + "start_timestamp": "2023-10-15 06:28:38", + "end_timestamp": "2023-10-15 06:29:50", + "alerts": [], + "labels": [ + "highly concurrent commits or highly concurrent inserts" + ], + "command": "python anomaly_trigger/main.py --anomaly INSERT_LARGE_DATA", + "script": "import psycopg2\nimport sys\nsys.path.append('/root/DB-GPT/')\nimport time\nimport datetime\nimport random\nimport yaml\nfrom multiprocessing.pool import *\n\nclass DBArgs(object):\n\n def __init__(self, dbtype, config, dbname=None):\n self.dbtype = dbtype\n if self.dbtype == 'mysql':\n self.host = config['host']\n self.port = config['port']\n self.user = config['user']\n self.password = config['password']\n self.dbname = dbname if dbname else config['dbname']\n self.driver = 'com.mysql.jdbc.Driver'\n self.jdbc = 'jdbc:mysql://'\n else:\n self.host = config['host']\n self.port = config['port']\n self.user = config['user']\n self.password = config['password']\n self.dbname = dbname if dbname else config['dbname']\n self.driver = 'org.postgresql.Driver'\n self.jdbc = 'jdbc:postgresql://'\n\nclass Database():\n def __init__(self, args, timeout=-1):\n self.args = args\n self.conn = self.resetConn(timeout)\n\n\n # self.schema = self.compute_table_schema()\n\n def resetConn(self, timeout=-1):\n conn = psycopg2.connect(database=self.args.dbname,\n user=self.args.user,\n password=self.args.password,\n host=self.args.host,\n port=self.args.port)\n return conn\n\n def execute_sqls(self,sql):\n self.conn =self.resetConn(timeout=-1)\n cur = self.conn.cursor()\n cur.execute(sql)\n self.conn.commit()\n cur.close()\n self.conn.close()\n\n def execute_sql_duration(self, duration, sql, max_id=0, commit_interval=500):\n self.conn = self.resetConn(timeout=-1)\n cursor = self.conn.cursor()\n start = time.time()\n cnt = 0\n if duration > 0:\n while (time.time() - start) < duration:\n if max_id > 0:\n id = random.randint(1, max_id - 1)\n cursor.execute(sql + str(id) + ';')\n else:\n cursor.execute(sql)\n cnt += 1\n if cnt % commit_interval == 0:\n self.conn.commit()\n else:\n print(\"error, the duration should be larger than 0\")\n self.conn.commit()\n cursor.close()\n self.conn.close()\n return cnt\n\n def concurrent_execute_sql(self, threads, duration, sql, max_id=0, commit_interval=500):\n pool = ThreadPool(threads)\n results = [pool.apply_async(self.execute_sql_duration, (duration, sql, max_id, commit_interval)) for _ in range(threads)]\n pool.close()\n pool.join()\n return results\n \ndef init():\n #add the config\n config_path = \"/root/DB-GPT/config/tool_config.yaml\"\n with open(config_path, 'r') as config_file:\n config = yaml.safe_load(config_file) \n db_args =DBArgs('pgsql', config)\n return db_args\n\n#create a table\ndef create_table(table_name,colsize, ncolumns):\n db=Database(init())\n column_definitions = ', '.join(f'name{i} varchar({colsize})' for i in range(ncolumns))\n creat_sql = f'CREATE TABLE {table_name} (id int, {column_definitions}, time timestamp);'\n db.execute_sqls(creat_sql)\n\n#delete the table\ndef delete_table(table_name):\n db=Database(init())\n delete_sql=f'DROP TABLE if exists {table_name}'\n db.execute_sqls(delete_sql)\n\n#print the current time\ndef print_time():\n current_time = datetime.datetime.now()\n formatted_time = 
current_time.strftime(\"%Y-%m-%d %H:%M:%S\")\n print(formatted_time)\n\n\ndef insert_large_data(threads,duration,ncolumns,nrows,colsize,table_name='table1'):\n\n print_time()\n #Delete undeleted tables\n delete_table(table_name)\n #create a new table\n create_table(table_name,colsize, ncolumns)\n db=Database(init())\n #insert the data\n #insert_definitions = ', '.join(f'repeat(round(random()*999)::text,{(colsize//3)})' for i in range(ncolumns))\n insert_definitions = ', '.join(f'(SELECT substr(md5(random()::text), 1, {colsize}))' for i in range(ncolumns))\n insert_data=f'insert into {table_name} select generate_series(1,{nrows}),{insert_definitions}, now();'\n db.concurrent_execute_sql(threads,duration,insert_data,commit_interval=1)\n\n #delete the table\n delete_table(table_name)\n \n #print the end time\n print_time()\nif __name__ == \"__main__\":\n # Number of threads to use for concurrent inserts\n num_threads = 179\n \n # Duration for which to run the inserts (in seconds)\n insert_duration = None\n \n # Number of columns in the table\n num_columns = 15\n \n # Number of rows to insert\n num_rows = 67\n \n # Size of each column (in characters)\n column_size = 25\n \n # Table name\n table_name = 'table1'\n \n # Call the insert_large_data function\n insert_large_data(num_threads, insert_duration, num_columns, num_rows, column_size, table_name)\n", + "description": "In a real-life scenario, there is a data collection system that involves 179 sensors. These sensors generate a large amount of data simultaneously, which needs to be inserted into a database. The database table has 15 columns, each with a size of 25 characters, and there are 67 rows of data. This process aims to simulate the database exception that can occur due to the insertion of a large amount of data from multiple sensors at the same time.\n", + "desc": "In an Internet of Things (IoT) scenario, there is a database named 'IoTDB' used to store and analyze sensor data generated by various smart devices. This database contains a table named 'SensorData', which records data from 67 different sensors. Each sensor generates data for 15 different attributes, such as temperature, humidity, light intensity, motion detection, and more. The table has a total of 67 rows, each representing a specific sensor, and each row has 15 columns to store the data for each attribute. These columns can store data up to 25 characters in length. At a certain time, 179 smart devices, each equipped with a sensor, start transmitting data simultaneously and at a rapid rate. This influx of data puts a strain on the database's ability to handle such a large number of write requests. The database might encounter performance issues and delays due to insufficient buffering or indexing mechanisms. This can result in slower write speeds and potential anomalies within the database. 
These anomalies may affect the accuracy and efficiency of data processing and analysis, as well as hinder real-time monitoring and decision-making based on the IoT data.\n" + }, + "177": { + "start_time": "1697322650", + "end_time": "1697322722", + "start_timestamp": "2023-10-15 06:30:50", + "end_timestamp": "2023-10-15 06:32:02", + "alerts": [], + "labels": [ + "highly concurrent commits or highly concurrent inserts" + ], + "command": "python anomaly_trigger/main.py --anomaly INSERT_LARGE_DATA", + "script": "import psycopg2\nimport sys\nsys.path.append('/root/DB-GPT/')\nimport time\nimport datetime\nimport random\nimport yaml\nfrom multiprocessing.pool import *\n\nclass DBArgs(object):\n\n def __init__(self, dbtype, config, dbname=None):\n self.dbtype = dbtype\n if self.dbtype == 'mysql':\n self.host = config['host']\n self.port = config['port']\n self.user = config['user']\n self.password = config['password']\n self.dbname = dbname if dbname else config['dbname']\n self.driver = 'com.mysql.jdbc.Driver'\n self.jdbc = 'jdbc:mysql://'\n else:\n self.host = config['host']\n self.port = config['port']\n self.user = config['user']\n self.password = config['password']\n self.dbname = dbname if dbname else config['dbname']\n self.driver = 'org.postgresql.Driver'\n self.jdbc = 'jdbc:postgresql://'\n\nclass Database():\n def __init__(self, args, timeout=-1):\n self.args = args\n self.conn = self.resetConn(timeout)\n\n\n # self.schema = self.compute_table_schema()\n\n def resetConn(self, timeout=-1):\n conn = psycopg2.connect(database=self.args.dbname,\n user=self.args.user,\n password=self.args.password,\n host=self.args.host,\n port=self.args.port)\n return conn\n\n def execute_sqls(self,sql):\n self.conn =self.resetConn(timeout=-1)\n cur = self.conn.cursor()\n cur.execute(sql)\n self.conn.commit()\n cur.close()\n self.conn.close()\n\n def execute_sql_duration(self, duration, sql, max_id=0, commit_interval=500):\n self.conn = self.resetConn(timeout=-1)\n cursor = self.conn.cursor()\n start = time.time()\n cnt = 0\n if duration > 0:\n while (time.time() - start) < duration:\n if max_id > 0:\n id = random.randint(1, max_id - 1)\n cursor.execute(sql + str(id) + ';')\n else:\n cursor.execute(sql)\n cnt += 1\n if cnt % commit_interval == 0:\n self.conn.commit()\n else:\n print(\"error, the duration should be larger than 0\")\n self.conn.commit()\n cursor.close()\n self.conn.close()\n return cnt\n\n def concurrent_execute_sql(self, threads, duration, sql, max_id=0, commit_interval=500):\n pool = ThreadPool(threads)\n results = [pool.apply_async(self.execute_sql_duration, (duration, sql, max_id, commit_interval)) for _ in range(threads)]\n pool.close()\n pool.join()\n return results\n \ndef init():\n #add the config\n config_path = \"/root/DB-GPT/config/tool_config.yaml\"\n with open(config_path, 'r') as config_file:\n config = yaml.safe_load(config_file) \n db_args =DBArgs('pgsql', config)\n return db_args\n\n#create a table\ndef create_table(table_name,colsize, ncolumns):\n db=Database(init())\n column_definitions = ', '.join(f'name{i} varchar({colsize})' for i in range(ncolumns))\n creat_sql = f'CREATE TABLE {table_name} (id int, {column_definitions}, time timestamp);'\n db.execute_sqls(creat_sql)\n\n#delete the table\ndef delete_table(table_name):\n db=Database(init())\n delete_sql=f'DROP TABLE if exists {table_name}'\n db.execute_sqls(delete_sql)\n\n#print the current time\ndef print_time():\n current_time = datetime.datetime.now()\n formatted_time = current_time.strftime(\"%Y-%m-%d %H:%M:%S\")\n 
print(formatted_time)\n\n\ndef insert_large_data(threads,duration,ncolumns,nrows,colsize,table_name='table1'):\n\n print_time()\n #Delete undeleted tables\n delete_table(table_name)\n #create a new table\n create_table(table_name,colsize, ncolumns)\n db=Database(init())\n #insert the data\n #insert_definitions = ', '.join(f'repeat(round(random()*999)::text,{(colsize//3)})' for i in range(ncolumns))\n insert_definitions = ', '.join(f'(SELECT substr(md5(random()::text), 1, {colsize}))' for i in range(ncolumns))\n insert_data=f'insert into {table_name} select generate_series(1,{nrows}),{insert_definitions}, now();'\n db.concurrent_execute_sql(threads,duration,insert_data,commit_interval=1)\n\n #delete the table\n delete_table(table_name)\n \n #print the end time\n print_time()\nif __name__ == \"__main__\":\n # Number of threads to use for concurrent inserts\n num_threads = 179\n \n # Duration for which to run the inserts (in seconds)\n insert_duration = None\n \n # Number of columns in the table\n num_columns = 26\n \n # Number of rows to insert\n num_rows = 66\n \n # Size of each column (in characters)\n column_size = 55\n \n # Table name\n table_name = 'table1'\n \n # Call the insert_large_data function\n insert_large_data(num_threads, insert_duration, num_columns, num_rows, column_size, table_name)\n", + "description": "In a data analysis system, a large amount of data generated by 179 sensors needs to be inserted into the database simultaneously. Simulate the database exception caused by this process, where the database table contains 26 columns, 66 rows, and each column has a size of 55 characters.\n", + "desc": "In an Internet of Things (IoT) scenario, suppose there is a database named 'IoTDataDB' that is specifically designed for collecting and analyzing sensor data from various IoT devices. This database serves as a central storage for the data generated by 66 sensors. Each sensor generates data at a high frequency and sends it to the database. The 'IoTDataDB' database includes a key table named 'SensorReadings' that stores the sensor data. This table consists of 66 rows of data, with each row representing a reading from one sensor. Each row contains 26 columns, including sensor ID, reading type (such as temperature, humidity, pressure, light, motion, etc.), reading value, timestamp, sensor location, and status information. On a specific day, there are 179 threads (or devices) simultaneously trying to insert sensor data into the 'SensorReadings' table. Due to the high concurrency and the lack of optimization measures such as data partitioning, buffering, or indexing, these concurrent write operations could lead to performance issues and anomalies in the database. 
This could result in delayed or failed insertions of sensor data, impacting the reliability and efficiency of the IoT system.\n" + }, + "178": { + "start_time": "1697322782", + "end_time": "1697322843", + "start_timestamp": "2023-10-15 06:33:02", + "end_timestamp": "2023-10-15 06:34:03", + "alerts": [], + "labels": [ + "highly concurrent updates" + ], + "command": "python anomaly_trigger/main.py --anomaly LOCK_CONTENTION", + "script": "import psycopg2\nimport sys\nsys.path.append('/root/DB-GPT/')\nimport time\nimport datetime\nimport random\nimport yaml\nfrom multiprocessing.pool import *\n\n\nclass DBArgs(object):\n\n def __init__(self, dbtype, config, dbname=None):\n self.dbtype = dbtype\n if self.dbtype == 'mysql':\n self.host = config['host']\n self.port = config['port']\n self.user = config['user']\n self.password = config['password']\n self.dbname = dbname if dbname else config['dbname']\n self.driver = 'com.mysql.jdbc.Driver'\n self.jdbc = 'jdbc:mysql://'\n else:\n self.host = config['host']\n self.port = config['port']\n self.user = config['user']\n self.password = config['password']\n self.dbname = dbname if dbname else config['dbname']\n self.driver = 'org.postgresql.Driver'\n self.jdbc = 'jdbc:postgresql://'\n\nclass Database():\n def __init__(self, args, timeout=-1):\n self.args = args\n self.conn = self.resetConn(timeout)\n\n\n # self.schema = self.compute_table_schema()\n\n def resetConn(self, timeout=-1):\n conn = psycopg2.connect(database=self.args.dbname,\n user=self.args.user,\n password=self.args.password,\n host=self.args.host,\n port=self.args.port)\n return conn\n \n def execute_sqls(self,sql):\n self.conn =self.resetConn(timeout=-1)\n cur = self.conn.cursor()\n cur.execute(sql)\n self.conn.commit()\n cur.close()\n self.conn.close()\n\n def execute_sql_duration(self, duration, sql, max_id=0, commit_interval=500):\n self.conn = self.resetConn(timeout=-1)\n cursor = self.conn.cursor()\n start = time.time()\n cnt = 0\n if duration > 0:\n while (time.time() - start) < duration:\n if max_id > 0:\n id = random.randint(1, max_id - 1)\n cursor.execute(sql + str(id) + ';')\n else:\n cursor.execute(sql)\n cnt += 1\n if cnt % commit_interval == 0:\n self.conn.commit()\n else:\n print(\"error, the duration should be larger than 0\")\n self.conn.commit()\n cursor.close()\n self.conn.close()\n return cnt\n\n def concurrent_execute_sql(self, threads, duration, sql, max_id=0, commit_interval=500):\n pool = ThreadPool(threads)\n results = [pool.apply_async(self.execute_sql_duration, (duration, sql, max_id, commit_interval)) for _ in range(threads)]\n pool.close()\n pool.join()\n return results\n\ndef init():\n #add the config\n config_path = \"/root/DB-GPT/config/tool_config.yaml\"\n with open(config_path, 'r') as config_file:\n config = yaml.safe_load(config_file) \n db_args =DBArgs('pgsql', config)\n return db_args\n\n\n#create a table\ndef create_table(table_name,colsize, ncolumns):\n db=Database(init())\n column_definitions = ', '.join(f'name{i} varchar({colsize})' for i in range(ncolumns))\n creat_sql = f'CREATE TABLE {table_name} (id int, {column_definitions}, time timestamp);'\n db.execute_sqls(creat_sql)\n\n#delete the table\ndef delete_table(table_name):\n db=Database(init())\n delete_sql=f'DROP TABLE if exists {table_name}'\n db.execute_sqls(delete_sql)\n\n#print the current time\ndef print_time():\n current_time = datetime.datetime.now()\n formatted_time = current_time.strftime(\"%Y-%m-%d %H:%M:%S\")\n print(formatted_time)\n\n\ndef 
lock_contention(threads,duration,ncolumns,nrows,colsize,table_name='table1'):\n #create a new table\n print_time()\n delete_table(table_name)\n create_table(table_name,colsize, ncolumns)\n db=Database(init())\n # insert some data to be updated \n insert_definitions = ', '.join(f'(SELECT substr(md5(random()::text), 1, {colsize}))' for i in range(ncolumns))\n insert_data=f'insert into {table_name} select generate_series(1,{nrows}),{insert_definitions}, now();' \n db.execute_sqls(insert_data) \n pool = Pool(threads)\n for _ in range(threads):\n pool.apply_async(\n lock, (table_name, ncolumns, colsize, duration, nrows))\n pool.close()\n pool.join()\n #delete the table\n delete_table(table_name)\n print_time()\n\ndef lock(table_name, ncolumns, colsize, duration, nrows):\n args=init()\n start = time.time()\n #lock_contention\n while time.time()-start < duration:\n conn = psycopg2.connect(database=args.dbname, user=args.user, password=args.password,\n host=args.host, port=args.port)\n cur = conn.cursor()\n while time.time()-start < duration:\n col_name = random.randint(0, ncolumns-1)\n row_name = random.randint(1, nrows-1)\n lock_contention = f'update {table_name} set name{col_name}=(SELECT substr(md5(random()::text), 1, {colsize})) where id ={row_name}'\n #db.concurrent_execute_sql(threads,duration,lock_contention,nrows)\n cur.execute(lock_contention)\n conn.commit()\n conn.commit()\n conn.close()\n\nif __name__ == \"__main__\":\n # Number of threads to use for concurrent inserts\n num_threads = 119\n \n # Duration for which to run the inserts (in seconds)\n insert_duration = None\n \n # Number of columns in the table\n num_columns = 92\n \n # Number of rows to insert\n num_rows = 271\n \n # Size of each column (in characters)\n column_size = 52\n \n # Table name\n table_name = 'table1'\n \n # Call the insert_large_data function\n lock_contention(num_threads, insert_duration, num_columns, num_rows, column_size, table_name)\n", + "description": "In the database of a mobile gaming platform, 119 players are simultaneously trying to update their game progress in a database table containing 92 columns and 271 rows of player information, with each column having a size of 52 characters. Due to the high number of players competing for locks on the database table, a database exception is triggered during the update process.\n", + "desc": "In a database called 'LifeSceneDB' that is used in a life scenario, there is a table named 'LifeRecords' that stores various types of life data. This table contains 271 rows of data, each representing a life event, and has 92 columns with each column containing information with a size of 52 characters. These columns may include event ID, event type (such as birth, marriage, death), event date and time, location, people involved, and additional details about the event. In this specific scenario, 119 users are simultaneously accessing and updating the 'LifeRecords' table, which can cause contention for the database locks. Due to the way the database handles locks, multiple users trying to access or update the same rows simultaneously can result in locked resources and delays in processing other users' requests. 
This lock contention can affect the overall performance and efficiency of the life scenario database, potentially leading to delayed or failed operations.\n" + }, + "179": { + "start_time": "1697322903", + "end_time": "1697322981", + "start_timestamp": "2023-10-15 06:35:03", + "end_timestamp": "2023-10-15 06:36:21", + "alerts": [], + "labels": [ + "highly deletes" + ], + "command": "python anomaly_trigger/main.py --anomaly VACUUM", + "script": "import psycopg2\nimport sys\nsys.path.append('/root/DB-GPT/')\nimport time\nimport datetime\nimport random\nimport yaml\nfrom multiprocessing.pool import *\n\n\nclass DBArgs(object):\n\n def __init__(self, dbtype, config, dbname=None):\n self.dbtype = dbtype\n if self.dbtype == 'mysql':\n self.host = config['host']\n self.port = config['port']\n self.user = config['user']\n self.password = config['password']\n self.dbname = dbname if dbname else config['dbname']\n self.driver = 'com.mysql.jdbc.Driver'\n self.jdbc = 'jdbc:mysql://'\n else:\n self.host = config['host']\n self.port = config['port']\n self.user = config['user']\n self.password = config['password']\n self.dbname = dbname if dbname else config['dbname']\n self.driver = 'org.postgresql.Driver'\n self.jdbc = 'jdbc:postgresql://'\n\nclass Database():\n def __init__(self, args, timeout=-1):\n self.args = args\n self.conn = self.resetConn(timeout)\n\n\n # self.schema = self.compute_table_schema()\n\n def resetConn(self, timeout=-1):\n conn = psycopg2.connect(database=self.args.dbname,\n user=self.args.user,\n password=self.args.password,\n host=self.args.host,\n port=self.args.port)\n return conn\n \n def execute_sqls(self,sql):\n self.conn =self.resetConn(timeout=-1)\n cur = self.conn.cursor()\n cur.execute(sql)\n self.conn.commit()\n cur.close()\n self.conn.close()\n\n def execute_sql_duration(self, duration, sql, max_id=0, commit_interval=500):\n self.conn = self.resetConn(timeout=-1)\n cursor = self.conn.cursor()\n start = time.time()\n cnt = 0\n if duration > 0:\n while (time.time() - start) < duration:\n if max_id > 0:\n id = random.randint(1, max_id - 1)\n cursor.execute(sql + str(id) + ';')\n else:\n cursor.execute(sql)\n cnt += 1\n if cnt % commit_interval == 0:\n self.conn.commit()\n else:\n print(\"error, the duration should be larger than 0\")\n self.conn.commit()\n cursor.close()\n self.conn.close()\n return cnt\n\n def concurrent_execute_sql(self, threads, duration, sql, max_id=0, commit_interval=500):\n pool = ThreadPool(threads)\n results = [pool.apply_async(self.execute_sql_duration, (duration, sql, max_id, commit_interval)) for _ in range(threads)]\n pool.close()\n pool.join()\n return results\n \ndef init():\n #add the config\n config_path = \"/root/DB-GPT/config/tool_config.yaml\"\n with open(config_path, 'r') as config_file:\n config = yaml.safe_load(config_file) \n db_args =DBArgs('pgsql', config)\n return db_args\n\n\n#create a table\ndef create_table(table_name,colsize, ncolumns):\n db=Database(init())\n column_definitions = ', '.join(f'name{i} varchar({colsize})' for i in range(ncolumns))\n creat_sql = f'CREATE TABLE {table_name} (id int, {column_definitions}, time timestamp);'\n db.execute_sqls(creat_sql)\n\n#delete the table\ndef delete_table(table_name):\n db=Database(init())\n delete_sql=f'DROP TABLE if exists {table_name}'\n db.execute_sqls(delete_sql)\n\n#print the current time\ndef print_time():\n current_time = datetime.datetime.now()\n formatted_time = current_time.strftime(\"%Y-%m-%d %H:%M:%S\")\n print(formatted_time)\n\n\ndef 
vacuum(threads,duration,ncolumns,nrows,colsize,table_name='table1'):\n db=Database(init())\n #create a new table\n print_time()\n delete_table(table_name)\n create_table(table_name,colsize, ncolumns)\n\n # insert some data to be deleted\n insert_definitions = ', '.join(f'(SELECT substr(md5(random()::text), 1, {colsize}))' for i in range(ncolumns))\n insert_data=f'insert into {table_name} select generate_series(1,{nrows}),{insert_definitions}, now();' \n db.execute_sqls(insert_data) \n\n # delete 80% of the rows\n delete_nrows=int(nrows*0.8)\n vacuum=f'delete from {table_name} where id < {delete_nrows};'\n db.execute_sqls(vacuum)\n\n # do the select , then the vacuum occurs\n select='select * from '+table_name+' where id='\n db.concurrent_execute_sql(threads,duration,select,nrows)\n\n #delete the table\n delete_table(table_name)\n print_time()\n\n\nif __name__ == \"__main__\":\n # Number of threads to use for concurrent inserts\n num_threads = 166\n \n # Duration for which to run the inserts (in seconds)\n insert_duration = None\n \n # Number of columns in the table\n num_columns = 15\n \n # Number of rows to insert\n num_rows = 2207549\n \n # Size of each column (in characters)\n column_size = 55\n \n # Table name\n table_name = 'table1'\n \n # Call the insert_large_data function\n vacuum(num_threads, insert_duration, num_columns, num_rows, column_size, table_name)\n", + "description": "In an e-commerce platform's database, multiple users are searching for products using various criteria such as product name, category, and price range. The database table being searched contains 15 columns and 2,207,549 rows, with each column size being 55 characters. However, there is an exception in the database caused by a simultaneous search performed by 166 users after a large-scale data cleaning operation called \"VACUUM\".\n", + "desc": "In an e-commerce environment, suppose there is a database called 'ECommerceDB' that stores information about various products. Specifically, there is a table named 'ProductDetails' that contains data for over 2.2 million products. This table has 15 columns, each with a size of 55 characters. The columns could include product ID, name, price, stock quantity, description, brand, category, size, color, weight, production date, expiration date, supplier information, country, and rating. At a certain point, there is a need to optimize the database performance, possibly due to business requirements such as data cleansing, product updates, or storage capacity limitations. In such a case, the database administrator decides to execute a VACUUM operation on the 'ProductDetails' table. The VACUUM operation is a common database maintenance command that reclaims unused space and reduces data fragmentation within the table. By performing this operation, the size of the table can be optimized, improving overall database performance.In this specific scenario, the VACUUM operation is triggered with 166 parallel threads, which indicates concurrent execution of the operation. This allows for faster processing and optimization of the table. However, it's important to note that the impact of the VACUUM operation on database performance will depend on various factors such as the hardware resources, the current database workload, and the size and fragmentation of the table. 
Proper planning and scheduling of the VACUUM operation can help avoid potential anomalies and ensure the smooth functioning of the e-commerce platform.\n" + }, + "180": { + "start_time": "1697323041", + "end_time": "1697323154", + "start_timestamp": "2023-10-15 06:37:21", + "end_timestamp": "2023-10-15 06:39:14", + "alerts": [], + "labels": [ + "too many indexes" + ], + "command": "python anomaly_trigger/main.py --anomaly REDUNDANT_INDEX", + "script": "import psycopg2\nimport sys\nsys.path.append('/root/DB-GPT/')\nimport time\nimport datetime\nimport random\nimport yaml\nfrom multiprocessing.pool import *\n\n\nclass DBArgs(object):\n\n def __init__(self, dbtype, config, dbname=None):\n self.dbtype = dbtype\n if self.dbtype == 'mysql':\n self.host = config['host']\n self.port = config['port']\n self.user = config['user']\n self.password = config['password']\n self.dbname = dbname if dbname else config['dbname']\n self.driver = 'com.mysql.jdbc.Driver'\n self.jdbc = 'jdbc:mysql://'\n else:\n self.host = config['host']\n self.port = config['port']\n self.user = config['user']\n self.password = config['password']\n self.dbname = dbname if dbname else config['dbname']\n self.driver = 'org.postgresql.Driver'\n self.jdbc = 'jdbc:postgresql://'\n\nclass Database():\n def __init__(self, args, timeout=-1):\n self.args = args\n self.conn = self.resetConn(timeout)\n\n\n # self.schema = self.compute_table_schema()\n\n def resetConn(self, timeout=-1):\n conn = psycopg2.connect(database=self.args.dbname,\n user=self.args.user,\n password=self.args.password,\n host=self.args.host,\n port=self.args.port)\n return conn\n \n def execute_sqls(self,sql):\n self.conn =self.resetConn(timeout=-1)\n cur = self.conn.cursor()\n cur.execute(sql)\n self.conn.commit()\n cur.close()\n self.conn.close()\n\n def execute_sql_duration(self, duration, sql, max_id=0, commit_interval=500):\n self.conn = self.resetConn(timeout=-1)\n cursor = self.conn.cursor()\n start = time.time()\n cnt = 0\n if duration > 0:\n while (time.time() - start) < duration:\n if max_id > 0:\n id = random.randint(1, max_id - 1)\n cursor.execute(sql + str(id) + ';')\n else:\n cursor.execute(sql)\n cnt += 1\n if cnt % commit_interval == 0:\n self.conn.commit()\n else:\n print(\"error, the duration should be larger than 0\")\n self.conn.commit()\n cursor.close()\n self.conn.close()\n return cnt\n\n def concurrent_execute_sql(self, threads, duration, sql, max_id=0, commit_interval=500):\n pool = ThreadPool(threads)\n results = [pool.apply_async(self.execute_sql_duration, (duration, sql, max_id, commit_interval)) for _ in range(threads)]\n pool.close()\n pool.join()\n return results\n \n def build_index(self, table_name, idx_num):\n self.conn = self.resetConn(timeout=-1)\n cursor = self.conn.cursor()\n \n for i in range(0, idx_num):\n the_sql = 'CREATE INDEX index_' + table_name + '_' + str(i) + ' ON ' + table_name + '(name' + str(i) + ');'\n print(the_sql)\n cursor.execute(the_sql)\n\n \n self.conn.commit()\n self.conn.close()\n return\n\n\n \n def drop_index(self,table_name):\n self.conn = self.resetConn(timeout=-1)\n cursor = self.conn.cursor()\n cursor.execute(\"select indexname from pg_indexes where tablename='\"+table_name+\"';\")\n idxs = cursor.fetchall()\n for idx in idxs:\n the_sql = 'DROP INDEX ' + idx[0] + ';'\n cursor.execute(the_sql)\n print(the_sql)\n self.conn.commit()\n self.conn.close()\n return\n\n\ndef init():\n #add the config\n config_path = \"/root/DB-GPT/config/tool_config.yaml\"\n with open(config_path, 'r') as config_file:\n config 
= yaml.safe_load(config_file) \n db_args =DBArgs('pgsql', config)\n return db_args\n\n\n#create a table\ndef create_table(table_name,colsize, ncolumns):\n db=Database(init())\n column_definitions = ', '.join(f'name{i} varchar({colsize})' for i in range(ncolumns))\n creat_sql = f'CREATE TABLE {table_name} (id int, {column_definitions}, time timestamp);'\n db.execute_sqls(creat_sql)\n\n#delete the table\ndef delete_table(table_name):\n db=Database(init())\n delete_sql=f'DROP TABLE if exists {table_name}'\n db.execute_sqls(delete_sql)\n\n#print the current time\ndef print_time():\n current_time = datetime.datetime.now()\n formatted_time = current_time.strftime(\"%Y-%m-%d %H:%M:%S\")\n print(formatted_time)\n\ndef redundent_index(threads,duration,ncolumns,nrows,colsize,nindex,table_name='table1'):\n #create a new table\n print_time()\n delete_table(table_name)\n create_table(table_name,colsize, ncolumns)\n db=Database(init())\n # insert some data to be updated \n insert_definitions = ', '.join(f'(SELECT substr(md5(random()::text), 1, {colsize}))' for i in range(ncolumns))\n insert_data=f'insert into {table_name} select generate_series(1,{nrows}),{insert_definitions}, now();' \n db.execute_sqls(insert_data) \n\n #initialization of the indexes\n nindex=int((nindex*ncolumns)/10)\n db.build_index(table_name,nindex)\n id_index='CREATE INDEX index_'+table_name+'_id ON '+table_name+'(id);'\n db.execute_sqls(id_index)\n\n #lock_contention\n pool = Pool(threads)\n for _ in range(threads):\n pool.apply_async(\n lock, (table_name, ncolumns, colsize, duration, nrows))\n pool.close()\n pool.join()\n\n #drop the index\n db.drop_index(table_name)\n\n #delete the table\n delete_table(table_name)\n print_time()\n\n\ndef lock(table_name, ncolumns, colsize, duration, nrows):\n args=init()\n start = time.time()\n #lock_contention\n while time.time()-start < duration:\n conn = psycopg2.connect(database=args.dbname, user=args.user, password=args.password,\n host=args.host, port=args.port)\n cur = conn.cursor()\n while time.time()-start < duration:\n col_name = random.randint(0, ncolumns-1)\n row_name = random.randint(1, nrows-1)\n lock_contention = f'update {table_name} set name{col_name}=(SELECT substr(md5(random()::text), 1, {colsize})) where id ={row_name}'\n #db.concurrent_execute_sql(threads,duration,lock_contention,nrows)\n cur.execute(lock_contention)\n conn.commit()\n conn.commit()\n conn.close()\n\nif __name__ == \"__main__\":\n # Number of threads to use for concurrent inserts\n num_threads = 8\n \n # Duration for which to run the inserts (in seconds)\n insert_duration = None\n \n # Number of columns in the table\n num_columns = 95\n \n # Number of rows to insert\n num_rows = 851284\n \n # Size of each column (in characters)\n column_size = 76\n \n # Table name\n table_name = 'table1'\n \n nindex=6\n \n # Call the insert_large_data function\n redundent_index(num_threads, insert_duration, num_columns, num_rows, column_size, nindex,table_name)\n", + "description": "In a financial system, 8 users are simultaneously querying a database table with 95 columns and 851,284 rows, each column size being 76 characters. Initially, a large number of indexes are created for items such as account number, transaction type, and amount, but these indexes are deleted after the query operation. 
The purpose of this scenario is to simulate the additional storage footprint and performance overhead caused by the creation and deletion of redundant indexes in the system.\n", + "desc": "In an Internet of Things (IoT) scenario, suppose there is a database called 'SensorDataDB' used for collecting and analyzing sensor data. This database is designed to handle a large volume of data from various types of sensors. One of the key tables in the database is 'SensorReadings', which contains information about 851,284 sensor readings. Each reading represents a specific measurement from a sensor and includes data such as sensor ID, reading type, reading value, timestamp, sensor location, and status. Additionally, this table has a total of 95 columns, each with a maximum size of 76 characters. In this specific scenario, the database experiences redundant index creation due to the need for query acceleration. For example, when performing data analysis or statistical calculations, indexes may be created on various columns such as reading type, timestamp, or sensor location. These indexes help improve query performance by speeding up the retrieval of relevant data. However, if too many indexes are created or if the indexes are not properly managed, it can result in redundant index creation.When executing the given script, the database system encounters the issue of redundant index creation. This can impact database performance and storage efficiency. Redundant indexes consume additional storage space and can also slow down write operations, as the database needs to update multiple indexes for each write operation.\n" + }, + "181": { + "start_time": "1697323214", + "end_time": "1697323304", + "start_timestamp": "2023-10-15 06:40:14", + "end_timestamp": "2023-10-15 06:41:44", + "alerts": [], + "labels": [ + "INSERT_LARGE_DATA", + "IO_CONTENTION" + ], + "command": "python anomaly_trigger/main.py --anomaly INSERT_LARGE_DATA,IO_CONTENTION", + "script": "import os\nimport datetime\n\n#print the current time\ndef print_time():\n current_time = datetime.datetime.now()\n formatted_time = current_time.strftime(\"%Y-%m-%d %H:%M:%S\")\n print(formatted_time)\n\nif __name__ == \"__main__\":\n print_time()\n command = (\n \"su - root -c 'cd /sysbench-tpcc-master; \"\n \"./tpcc.lua --db-driver=pgsql --tables=2 --scale=3 --threads=50 --events=0 \"\n \"--pgsql-host=xxxx --pgsql-user=xxxx --pgsql-password=xxxx \"\n \"--pgsql-port=5432 --pgsql-db=tpcc --time=90 --rand-type=uniform --report-interval=10 run'\"\n )\n\n os.system(command)\n print_time()\n", + "description": "In a file sharing system where multiple users are uploading, downloading, or editing files at the same time, the system experiences contention for input/output resources. This leads to slower file transfers and overall system performance.\n", + "desc": "Imagine a file sharing system used by teams or organizations for sharing files. The system not only stores the files but also records metadata such as uploader information, file size, creation and modification dates, version history, access permissions, and download counts. On a typical workday, multiple users are simultaneously uploading, downloading, or editing files. This includes large files such as presentations, video conference recordings, or design drawings. Due to the high concurrency in file operations, the system faces challenges of I/O (input/output) contention. When multiple users simultaneously upload or download large files, the system's storage and network bandwidth may be strained. 
This can result in slower file transfer speeds, especially when there is limited bandwidth or insufficient server processing capabilities. Additionally, frequent write operations in the database (such as file uploads and metadata updates) can impact database performance. During peak usage periods, the database may encounter locking and transaction management issues, further slowing down file processing and metadata recording.\n" + }, + "182": { + "start_time": "1697323364", + "end_time": "1697323425", + "start_timestamp": "2023-10-15 06:42:44", + "end_timestamp": "2023-10-15 06:43:45", + "alerts": [], + "labels": [ + "POOR JOIN PERFORMANCE", + "CPU CONTENTION" + ], + "command": "python anomaly_trigger/main.py --anomaly POOR_JOIN_PERFORMANCE,CPU_CONTENTION", + "script": "import os\nimport re\nimport time\n\nimport psycopg2\n\n\nREPEATCOUNT = 1\nTIMELOGPATH = str(int(time.time())) + \"_job_trigger_time_log.txt\"\nTIMELOG = open(TIMELOGPATH, 'w+')\n\n\nclass Database():\n\n def __init__(self):\n self.conn = None\n self.conn = psycopg2.connect(database='imdbload',\n user='xxxx',\n password='xxxx',\n host='xxxx',\n port=5432)\n\n def execute_sql(self, sql):\n fail = 1\n cur = self.conn.cursor()\n i = 0\n cnt = 3\n while fail == 1 and i < cnt:\n try:\n fail = 0\n cur.execute(sql)\n except BaseException as error:\n fail = 1\n print(error)\n res = []\n if fail == 0:\n res = cur.fetchall()\n i = i + 1\n if fail == 1:\n # print(\"SQL Execution Fatal!!\", sql)\n return 0, ''\n elif fail == 0:\n return 1, res\n\n\ndef all_sql_files():\n res_path = \"{}/join-order-benchmark-master/\".format(\n os.path.dirname(os.path.abspath(__file__)))\n # all_file_list = list(filter(file_filter, os.listdir(res_path)))\n # all_file_list = sorted(all_file_list, key=custom_sort)\n all_file_list = [\n '1a.sql', '1b.sql', '1c.sql', '1d.sql',\n '2a.sql', '2b.sql', '2c.sql', '2d.sql',\n '3a.sql', '3b.sql', '3c.sql',\n '4a.sql', '4b.sql', '4c.sql',\n '5a.sql', '5b.sql', '5c.sql',\n '6a.sql', '6b.sql', '6c.sql', '6d.sql', '6e.sql', '6f.sql',\n '7a.sql', '7b.sql', '7c.sql',\n '8a.sql', '8b.sql', '8c.sql', '8d.sql',\n '9a.sql', '9b.sql', '9c.sql', '9d.sql',\n '10a.sql', '10b.sql', '10c.sql',\n '11a.sql', '11b.sql', '11c.sql', '11d.sql',\n '12a.sql', '12b.sql', '12c.sql',\n '13a.sql', '13b.sql', '13c.sql', '13d.sql',\n '14a.sql', '14b.sql', '14c.sql',\n '15a.sql', '15b.sql', '15c.sql', '15d.sql',\n '16a.sql', '16b.sql', '16c.sql', '16d.sql',\n '17a.sql', '17b.sql', '17c.sql', '17d.sql', '17e.sql', '17f.sql',\n '18a.sql', '18b.sql', '18c.sql',\n '19a.sql', '19b.sql', '19c.sql', '19d.sql',\n '20a.sql', '20b.sql', '20c.sql',\n '21a.sql', '21b.sql', '21c.sql',\n '22a.sql', '22b.sql', '22c.sql', '22d.sql',\n '23a.sql', '23b.sql', '23c.sql',\n '24a.sql', '24b.sql',\n '25a.sql', '25b.sql', '25c.sql',\n '26a.sql', '26b.sql', '26c.sql',\n '27a.sql', '27b.sql', '27c.sql',\n '28a.sql', '28b.sql', '28c.sql',\n '29a.sql', '29b.sql', '29c.sql',\n '30a.sql', '30b.sql', '30c.sql',\n '31a.sql', '31b.sql', '31c.sql',\n '32a.sql', '32b.sql',\n '33a.sql', '33b.sql', '33c.sql']\n\n print(all_file_list)\n files_list = []\n for file in all_file_list:\n files_list.append(res_path + file)\n return files_list\n\n\ndef custom_sort(item):\n # \u63d0\u53d6\u6570\u5b57\u548c\u5b57\u6bcd\u90e8\u5206\n match = re.match(r'(\\d+)(\\D+)', item)\n # \u5c06\u6570\u5b57\u90e8\u5206\u8f6c\u6362\u4e3a\u6574\u6570\u4ee5\u8fdb\u884c\u6bd4\u8f83\n num_part = int(match.group(1))\n # \u8fd4\u56de\u5143\u7ec4\u4ee5\u6309\u6570\u5b57\u548c\u5b57\u6bcd\u6392\u5e8f\n return 
(num_part, match.group(2))\n\n\ndef file_filter(f):\n if f[-4:] == '.sql' and 'schema' not in f and 'fkindexes' not in f:\n return True\n else:\n return False\n\n\ndef get_sql_from_file(file_name):\n file = open(file_name)\n lines = file.readlines().copy()\n sql = ''\n for line in lines:\n sql += line\n sql = sql.replace('\n', ' ').replace(' ', ' ').replace(' ', ' ')\n file.close()\n return sql\n\n\ndef test_hint_from_file(sql_file):\n db = Database()\n sql = get_sql_from_file(sql_file)\n success, result_cont = db.execute_sql(sql)\n print(success, result_cont)\n\n\ndef test_all():\n sql_files = all_sql_files()\n\n for sql_file in list(sql_files)[:-10]:\n if sql_file:\n test_hint_from_file(sql_file)\n\n\ndef test_one():\n res_path = \"{}/join-order-benchmark-master/\".format(\n os.path.dirname(os.path.abspath(__file__)))\n test_hint_from_file(res_path + '1a.sql')\n\n\nif __name__ == '__main__':\n for i in range(0, REPEATCOUNT):\n TIMELOG.write(str(int(time.time()))+\";\")\n test_all()\n TIMELOG.write(str(int(time.time()))+\"\n\")\n TIMELOG.flush()\n\n TIMELOG.close()\n\n)\n", + "description": "In a database system experiencing poor join performance and CPU contention, the script \"anomaly_trigger/main.py\" is used to simulate a scenario where multiple users simultaneously perform join operations on a database with high CPU usage. This can result in slower query performance due to the competition for CPU resources among the users.\n", + "desc": "In a business intelligence scenario, there is a database named 'CorporateAnalyticsDB' that stores and analyzes various business data for a large corporation. This database contains multiple complex tables, such as 'SalesData', 'CustomerProfiles', 'ProductCatalog', and 'MarketTrends', which contain a large number of rows and columns recording detailed business information. In this scenario, analysts frequently need to perform complex join queries across multiple tables to generate comprehensive business reports. These queries involve joining the 'SalesData' table with the 'CustomerProfiles' table to analyze customer purchasing behavior. Additionally, data from the 'ProductCatalog' and 'MarketTrends' tables are also linked to gain deeper market insights. However, due to the large size of the tables and the complexity of the join operations, the performance of these queries is slow. This is mainly because the join keys in the tables are not effectively indexed, and the queries are not properly optimized. During peak periods, when multiple complex join queries are executed simultaneously, there is competition for CPU resources, further reducing query efficiency. 
This CPU contention occurs when there are too many compute-intensive queries running on the database server or when the server's CPU resources are insufficient to handle the workload.\n" + }, + "183": { + "start_time": "1697323485", + "end_time": "1697323631", + "start_timestamp": "2023-10-15 06:44:45", + "end_timestamp": "2023-10-15 06:47:11", + "alerts": [], + "labels": [ + "FETCH_LARGE_DATA", + "CORRELATED SUBQUERY" + ], + "command": "python anomaly_trigger/main.py --anomaly FETCH_LARGE_DATA,CORRELATED_SUBQUERY", + "script": "import os\nimport re\nimport time\n\nimport psycopg2\n\n\nREPEATCOUNT = 1\nTIMELOGPATH = str(int(time.time())) + \"_tpch_trigger_time_log.txt\"\nTIMELOG = open(TIMELOGPATH, 'w+')\n\n\nclass Database():\n\n def __init__(self):\n self.conn = None\n self.conn = psycopg2.connect(database='tpch',\n user='xxxx',\n password='xxxx',\n host='xxxx',\n port=5432)\n\n def execute_sql(self, sql):\n fail = 1\n cur = self.conn.cursor()\n i = 0\n cnt = 3\n while fail == 1 and i < cnt:\n try:\n fail = 0\n cur.execute(sql)\n except BaseException as error:\n fail = 1\n print(error)\n res = []\n if fail == 0:\n res = cur.fetchall()\n i = i + 1\n if fail == 1:\n # print(\"SQL Execution Fatal!!\", sql)\n return 0, ''\n elif fail == 0:\n return 1, res\n\n\ndef all_sql_files():\n res_path = \"{}/tpch-queries/\".format(\n os.path.dirname(os.path.abspath(__file__)))\n # all_file_list = list(filter(file_filter, os.listdir(res_path)))\n # all_file_list = sorted(all_file_list, key=custom_sort)\n all_file_list = [\n '4.explain.sql']\n\n print(all_file_list)\n files_list = []\n for file in all_file_list:\n files_list.append(res_path + file)\n return files_list\n\n\ndef custom_sort(item):\n # \u63d0\u53d6\u6570\u5b57\u548c\u5b57\u6bcd\u90e8\u5206\n match = re.match(r'(\\d+)(\\D+)', item)\n # \u5c06\u6570\u5b57\u90e8\u5206\u8f6c\u6362\u4e3a\u6574\u6570\u4ee5\u8fdb\u884c\u6bd4\u8f83\n num_part = int(match.group(1))\n # \u8fd4\u56de\u5143\u7ec4\u4ee5\u6309\u6570\u5b57\u548c\u5b57\u6bcd\u6392\u5e8f\n return (num_part, match.group(2))\n\n\ndef file_filter(f):\n if f[-4:] == '.sql' and 'schema' not in f and 'fkindexes' not in f:\n return True\n else:\n return False\n\n\ndef get_sql_from_file(file_name):\n file = open(file_name)\n lines = file.readlines().copy()\n sql = ''\n for line in lines:\n sql += line\n sql = sql.replace('\n', ' ').replace(' ', ' ').replace(' ', ' ')\n file.close()\n return sql\n\n\ndef test_hint_from_file(sql_file):\n db = Database()\n sql = get_sql_from_file(sql_file)\n success, result_cont = db.execute_sql(sql)\n print(success, result_cont)\n\n\ndef test_all():\n sql_files = all_sql_files()\n\n for sql_file in list(sql_files):\n if sql_file:\n test_hint_from_file(sql_file)\n\n\ndef test_one():\n res_path = \"{}/tpch-queries/\".format(\n os.path.dirname(os.path.abspath(__file__)))\n test_hint_from_file(res_path + '1.explain.sql')\n\n\nif __name__ == '__main__':\n for i in range(0, REPEATCOUNT):\n TIMELOG.write(str(int(time.time()))+\";\")\n test_all()\n TIMELOG.write(str(int(time.time()))+\"\n\")\n TIMELOG.flush()\n\n TIMELOG.close()\n", + "description": "In an e-commerce platform's database, there is a scenario where a large amount of data needs to be fetched, specifically the inventory for each product. This involves executing correlated subqueries, which are subqueries that depend on the results of other queries. 
If these subqueries are not optimized and there are a large number of products, the performance of the inventory query may deteriorate.\n", + "desc": "In an e-commerce database, there is a key table named 'ProductInventory' that stores the inventory information of various products. This table contains data for tens of thousands or even hundreds of thousands of products. Each product's inventory information includes details such as product ID, stock level, last update time, supplier ID, and warehouse location. In this database, there is a common query to determine the total inventory of all products within a specific category. This query involves selecting all products of a particular category from the 'ProductDetails' table and performing subqueries on the 'ProductInventory' table to obtain the inventory data for these products. However, when there is a large number of products in a category, the performance of these subqueries can become inefficient. Executing individual subqueries for each product to retrieve inventory information can be time-consuming, leading to potential I/O bottlenecks as the database needs to read a significant amount of data from the disk.\n" + }, + "184": { + "start_time": "1697323692", + "end_time": "1697323764", + "start_timestamp": "2023-10-15 06:48:12", + "end_timestamp": "2023-10-15 06:49:24", + "alerts": [], + "labels": [ + "highly concurrent commits or highly concurrent inserts" + ], + "command": "python anomaly_trigger/main.py --anomaly INSERT_LARGE_DATA", + "script": "import psycopg2\nimport sys\nsys.path.append('/root/DB-GPT/')\nimport time\nimport datetime\nimport random\nimport yaml\nfrom multiprocessing.pool import *\n\nclass DBArgs(object):\n\n def __init__(self, dbtype, config, dbname=None):\n self.dbtype = dbtype\n if self.dbtype == 'mysql':\n self.host = config['host']\n self.port = config['port']\n self.user = config['user']\n self.password = config['password']\n self.dbname = dbname if dbname else config['dbname']\n self.driver = 'com.mysql.jdbc.Driver'\n self.jdbc = 'jdbc:mysql://'\n else:\n self.host = config['host']\n self.port = config['port']\n self.user = config['user']\n self.password = config['password']\n self.dbname = dbname if dbname else config['dbname']\n self.driver = 'org.postgresql.Driver'\n self.jdbc = 'jdbc:postgresql://'\n\nclass Database():\n def __init__(self, args, timeout=-1):\n self.args = args\n self.conn = self.resetConn(timeout)\n\n\n # self.schema = self.compute_table_schema()\n\n def resetConn(self, timeout=-1):\n conn = psycopg2.connect(database=self.args.dbname,\n user=self.args.user,\n password=self.args.password,\n host=self.args.host,\n port=self.args.port)\n return conn\n\n def execute_sqls(self,sql):\n self.conn =self.resetConn(timeout=-1)\n cur = self.conn.cursor()\n cur.execute(sql)\n self.conn.commit()\n cur.close()\n self.conn.close()\n\n def execute_sql_duration(self, duration, sql, max_id=0, commit_interval=500):\n self.conn = self.resetConn(timeout=-1)\n cursor = self.conn.cursor()\n start = time.time()\n cnt = 0\n if duration > 0:\n while (time.time() - start) < duration:\n if max_id > 0:\n id = random.randint(1, max_id - 1)\n cursor.execute(sql + str(id) + ';')\n else:\n cursor.execute(sql)\n cnt += 1\n if cnt % commit_interval == 0:\n self.conn.commit()\n else:\n print(\"error, the duration should be larger than 0\")\n self.conn.commit()\n cursor.close()\n self.conn.close()\n return cnt\n\n def concurrent_execute_sql(self, threads, duration, sql, max_id=0, commit_interval=500):\n pool = 
ThreadPool(threads)\n results = [pool.apply_async(self.execute_sql_duration, (duration, sql, max_id, commit_interval)) for _ in range(threads)]\n pool.close()\n pool.join()\n return results\n \ndef init():\n #add the config\n config_path = \"/root/DB-GPT/config/tool_config.yaml\"\n with open(config_path, 'r') as config_file:\n config = yaml.safe_load(config_file) \n db_args =DBArgs('pgsql', config)\n return db_args\n\n#create a table\ndef create_table(table_name,colsize, ncolumns):\n db=Database(init())\n column_definitions = ', '.join(f'name{i} varchar({colsize})' for i in range(ncolumns))\n creat_sql = f'CREATE TABLE {table_name} (id int, {column_definitions}, time timestamp);'\n db.execute_sqls(creat_sql)\n\n#delete the table\ndef delete_table(table_name):\n db=Database(init())\n delete_sql=f'DROP TABLE if exists {table_name}'\n db.execute_sqls(delete_sql)\n\n#print the current time\ndef print_time():\n current_time = datetime.datetime.now()\n formatted_time = current_time.strftime(\"%Y-%m-%d %H:%M:%S\")\n print(formatted_time)\n\n\ndef insert_large_data(threads,duration,ncolumns,nrows,colsize,table_name='table1'):\n\n print_time()\n #Delete undeleted tables\n delete_table(table_name)\n #create a new table\n create_table(table_name,colsize, ncolumns)\n db=Database(init())\n #insert the data\n #insert_definitions = ', '.join(f'repeat(round(random()*999)::text,{(colsize//3)})' for i in range(ncolumns))\n insert_definitions = ', '.join(f'(SELECT substr(md5(random()::text), 1, {colsize}))' for i in range(ncolumns))\n insert_data=f'insert into {table_name} select generate_series(1,{nrows}),{insert_definitions}, now();'\n db.concurrent_execute_sql(threads,duration,insert_data,commit_interval=1)\n\n #delete the table\n delete_table(table_name)\n \n #print the end time\n print_time()\nif __name__ == \"__main__\":\n # Number of threads to use for concurrent inserts\n num_threads = 102\n \n # Duration for which to run the inserts (in seconds)\n insert_duration = None\n \n # Number of columns in the table\n num_columns = 9\n \n # Number of rows to insert\n num_rows = 67\n \n # Size of each column (in characters)\n column_size = 27\n \n # Table name\n table_name = 'table1'\n \n # Call the insert_large_data function\n insert_large_data(num_threads, insert_duration, num_columns, num_rows, column_size, table_name)\n", + "description": "In a smart home system, if there are 102 devices simultaneously sending large amounts of data to the database, each device having 9 data points with a size of 27 characters, and a total of 67 data entries, the database may experience slow insertion and potential exceptions due to the high volume of data being inserted simultaneously.\n", + "desc": "In an Internet of Things (IoT) scenario, imagine there is a database named 'IoTDataDB' that is responsible for storing and processing data collected from various IoT devices. This database is designed to handle a large volume of data from sensors such as temperature, humidity, pressure, light, motion, and more. In this particular scenario, the database encounters an anomaly triggered by a large-scale data insertion operation. This operation involves inserting a significant amount of sensor data into the 'SensorData' table within the database. The 'SensorData' table consists of 67 rows of data, each representing a single data reading from a sensor, with a total of 9 columns, each column containing data of up to 27 characters. These columns may include sensor ID, data type, data value, timestamp, location, and more. 
However, due to the high number of concurrent insertion requests (102 threads), the database's performance is impacted. The system struggles to efficiently process and index these large amounts of data, resulting in delays and decreased overall performance in the data insertion process. This anomaly highlights the importance of optimizing the data insertion process and implementing effective indexing strategies to ensure the smooth operation of the IoT data management system.\n" + }, + "185": { + "start_time": "1697323824", + "end_time": "1697323895", + "start_timestamp": "2023-10-15 06:50:24", + "end_timestamp": "2023-10-15 06:51:35", + "alerts": [], + "labels": [ + "highly concurrent commits or highly concurrent inserts" + ], + "command": "python anomaly_trigger/main.py --anomaly INSERT_LARGE_DATA", + "script": "import psycopg2\nimport sys\nsys.path.append('/root/DB-GPT/')\nimport time\nimport datetime\nimport random\nimport yaml\nfrom multiprocessing.pool import *\n\nclass DBArgs(object):\n\n def __init__(self, dbtype, config, dbname=None):\n self.dbtype = dbtype\n if self.dbtype == 'mysql':\n self.host = config['host']\n self.port = config['port']\n self.user = config['user']\n self.password = config['password']\n self.dbname = dbname if dbname else config['dbname']\n self.driver = 'com.mysql.jdbc.Driver'\n self.jdbc = 'jdbc:mysql://'\n else:\n self.host = config['host']\n self.port = config['port']\n self.user = config['user']\n self.password = config['password']\n self.dbname = dbname if dbname else config['dbname']\n self.driver = 'org.postgresql.Driver'\n self.jdbc = 'jdbc:postgresql://'\n\nclass Database():\n def __init__(self, args, timeout=-1):\n self.args = args\n self.conn = self.resetConn(timeout)\n\n\n # self.schema = self.compute_table_schema()\n\n def resetConn(self, timeout=-1):\n conn = psycopg2.connect(database=self.args.dbname,\n user=self.args.user,\n password=self.args.password,\n host=self.args.host,\n port=self.args.port)\n return conn\n\n def execute_sqls(self,sql):\n self.conn =self.resetConn(timeout=-1)\n cur = self.conn.cursor()\n cur.execute(sql)\n self.conn.commit()\n cur.close()\n self.conn.close()\n\n def execute_sql_duration(self, duration, sql, max_id=0, commit_interval=500):\n self.conn = self.resetConn(timeout=-1)\n cursor = self.conn.cursor()\n start = time.time()\n cnt = 0\n if duration > 0:\n while (time.time() - start) < duration:\n if max_id > 0:\n id = random.randint(1, max_id - 1)\n cursor.execute(sql + str(id) + ';')\n else:\n cursor.execute(sql)\n cnt += 1\n if cnt % commit_interval == 0:\n self.conn.commit()\n else:\n print(\"error, the duration should be larger than 0\")\n self.conn.commit()\n cursor.close()\n self.conn.close()\n return cnt\n\n def concurrent_execute_sql(self, threads, duration, sql, max_id=0, commit_interval=500):\n pool = ThreadPool(threads)\n results = [pool.apply_async(self.execute_sql_duration, (duration, sql, max_id, commit_interval)) for _ in range(threads)]\n pool.close()\n pool.join()\n return results\n \ndef init():\n #add the config\n config_path = \"/root/DB-GPT/config/tool_config.yaml\"\n with open(config_path, 'r') as config_file:\n config = yaml.safe_load(config_file) \n db_args =DBArgs('pgsql', config)\n return db_args\n\n#create a table\ndef create_table(table_name,colsize, ncolumns):\n db=Database(init())\n column_definitions = ', '.join(f'name{i} varchar({colsize})' for i in range(ncolumns))\n creat_sql = f'CREATE TABLE {table_name} (id int, {column_definitions}, time timestamp);'\n 
db.execute_sqls(creat_sql)\n\n#delete the table\ndef delete_table(table_name):\n db=Database(init())\n delete_sql=f'DROP TABLE if exists {table_name}'\n db.execute_sqls(delete_sql)\n\n#print the current time\ndef print_time():\n current_time = datetime.datetime.now()\n formatted_time = current_time.strftime(\"%Y-%m-%d %H:%M:%S\")\n print(formatted_time)\n\n\ndef insert_large_data(threads,duration,ncolumns,nrows,colsize,table_name='table1'):\n\n print_time()\n #Delete undeleted tables\n delete_table(table_name)\n #create a new table\n create_table(table_name,colsize, ncolumns)\n db=Database(init())\n #insert the data\n #insert_definitions = ', '.join(f'repeat(round(random()*999)::text,{(colsize//3)})' for i in range(ncolumns))\n insert_definitions = ', '.join(f'(SELECT substr(md5(random()::text), 1, {colsize}))' for i in range(ncolumns))\n insert_data=f'insert into {table_name} select generate_series(1,{nrows}),{insert_definitions}, now();'\n db.concurrent_execute_sql(threads,duration,insert_data,commit_interval=1)\n\n #delete the table\n delete_table(table_name)\n \n #print the end time\n print_time()\nif __name__ == \"__main__\":\n # Number of threads to use for concurrent inserts\n num_threads = 102\n \n # Duration for which to run the inserts (in seconds)\n insert_duration = None\n \n # Number of columns in the table\n num_columns = 20\n \n # Number of rows to insert\n num_rows = 75\n \n # Size of each column (in characters)\n column_size = 82\n \n # Table name\n table_name = 'table1'\n \n # Call the insert_large_data function\n insert_large_data(num_threads, insert_duration, num_columns, num_rows, column_size, table_name)\n", + "description": "In a data analytics system, 102 threads are simultaneously inserting a large amount of data into a database table. The table has 20 columns, with each column having a size of 82 characters, and a total of 75 rows. This simulation aims to trigger a database exception caused by the high volume of data being inserted.\n", + "desc": "In an e-commerce scenario, there is a database specifically used for storing and managing product data for an online store. This database, named 'ProductDataDB', consists of a table called 'ProductInformation' that contains detailed information about various products. This table contains 75 rows of data, each representing a product entry. Each row has a total of 20 columns, each with a size of 82 characters. These columns store information such as product ID, name, price, stock quantity, brand, category, size, color, weight, user rating, number of reviews, production date, expiration date, supplier information, country, shipping method, discount information, image link, description, and product status.In a situation where 102 concurrent users are performing large-scale data insertion operations, such as adding new products to the database, the database might encounter performance issues. This is caused by the lack of efficient data buffering and indexing, making it difficult for the database to handle such a high number of simultaneous insert operations efficiently. 
As a result, the database may experience delays in processing these insertions, leading to anomalies.These anomalies can affect the overall performance and functionality of the e-commerce platform, potentially impacting user experience and the reputation of the online store.\n" + }, + "186": { + "start_time": "1697323955", + "end_time": "1697324015", + "start_timestamp": "2023-10-15 06:52:35", + "end_timestamp": "2023-10-15 06:53:35", + "alerts": [], + "labels": [ + "highly concurrent updates" + ], + "command": "python anomaly_trigger/main.py --anomaly LOCK_CONTENTION", + "script": "import psycopg2\nimport sys\nsys.path.append('/root/DB-GPT/')\nimport time\nimport datetime\nimport random\nimport yaml\nfrom multiprocessing.pool import *\n\n\nclass DBArgs(object):\n\n def __init__(self, dbtype, config, dbname=None):\n self.dbtype = dbtype\n if self.dbtype == 'mysql':\n self.host = config['host']\n self.port = config['port']\n self.user = config['user']\n self.password = config['password']\n self.dbname = dbname if dbname else config['dbname']\n self.driver = 'com.mysql.jdbc.Driver'\n self.jdbc = 'jdbc:mysql://'\n else:\n self.host = config['host']\n self.port = config['port']\n self.user = config['user']\n self.password = config['password']\n self.dbname = dbname if dbname else config['dbname']\n self.driver = 'org.postgresql.Driver'\n self.jdbc = 'jdbc:postgresql://'\n\nclass Database():\n def __init__(self, args, timeout=-1):\n self.args = args\n self.conn = self.resetConn(timeout)\n\n\n # self.schema = self.compute_table_schema()\n\n def resetConn(self, timeout=-1):\n conn = psycopg2.connect(database=self.args.dbname,\n user=self.args.user,\n password=self.args.password,\n host=self.args.host,\n port=self.args.port)\n return conn\n \n def execute_sqls(self,sql):\n self.conn =self.resetConn(timeout=-1)\n cur = self.conn.cursor()\n cur.execute(sql)\n self.conn.commit()\n cur.close()\n self.conn.close()\n\n def execute_sql_duration(self, duration, sql, max_id=0, commit_interval=500):\n self.conn = self.resetConn(timeout=-1)\n cursor = self.conn.cursor()\n start = time.time()\n cnt = 0\n if duration > 0:\n while (time.time() - start) < duration:\n if max_id > 0:\n id = random.randint(1, max_id - 1)\n cursor.execute(sql + str(id) + ';')\n else:\n cursor.execute(sql)\n cnt += 1\n if cnt % commit_interval == 0:\n self.conn.commit()\n else:\n print(\"error, the duration should be larger than 0\")\n self.conn.commit()\n cursor.close()\n self.conn.close()\n return cnt\n\n def concurrent_execute_sql(self, threads, duration, sql, max_id=0, commit_interval=500):\n pool = ThreadPool(threads)\n results = [pool.apply_async(self.execute_sql_duration, (duration, sql, max_id, commit_interval)) for _ in range(threads)]\n pool.close()\n pool.join()\n return results\n\ndef init():\n #add the config\n config_path = \"/root/DB-GPT/config/tool_config.yaml\"\n with open(config_path, 'r') as config_file:\n config = yaml.safe_load(config_file) \n db_args =DBArgs('pgsql', config)\n return db_args\n\n\n#create a table\ndef create_table(table_name,colsize, ncolumns):\n db=Database(init())\n column_definitions = ', '.join(f'name{i} varchar({colsize})' for i in range(ncolumns))\n creat_sql = f'CREATE TABLE {table_name} (id int, {column_definitions}, time timestamp);'\n db.execute_sqls(creat_sql)\n\n#delete the table\ndef delete_table(table_name):\n db=Database(init())\n delete_sql=f'DROP TABLE if exists {table_name}'\n db.execute_sqls(delete_sql)\n\n#print the current time\ndef print_time():\n current_time = 
datetime.datetime.now()\n formatted_time = current_time.strftime(\"%Y-%m-%d %H:%M:%S\")\n print(formatted_time)\n\n\ndef lock_contention(threads,duration,ncolumns,nrows,colsize,table_name='table1'):\n #create a new table\n print_time()\n delete_table(table_name)\n create_table(table_name,colsize, ncolumns)\n db=Database(init())\n # insert some data to be updated \n insert_definitions = ', '.join(f'(SELECT substr(md5(random()::text), 1, {colsize}))' for i in range(ncolumns))\n insert_data=f'insert into {table_name} select generate_series(1,{nrows}),{insert_definitions}, now();' \n db.execute_sqls(insert_data) \n pool = Pool(threads)\n for _ in range(threads):\n pool.apply_async(\n lock, (table_name, ncolumns, colsize, duration, nrows))\n pool.close()\n pool.join()\n #delete the table\n delete_table(table_name)\n print_time()\n\ndef lock(table_name, ncolumns, colsize, duration, nrows):\n args=init()\n start = time.time()\n #lock_contention\n while time.time()-start < duration:\n conn = psycopg2.connect(database=args.dbname, user=args.user, password=args.password,\n host=args.host, port=args.port)\n cur = conn.cursor()\n while time.time()-start < duration:\n col_name = random.randint(0, ncolumns-1)\n row_name = random.randint(1, nrows-1)\n lock_contention = f'update {table_name} set name{col_name}=(SELECT substr(md5(random()::text), 1, {colsize})) where id ={row_name}'\n #db.concurrent_execute_sql(threads,duration,lock_contention,nrows)\n cur.execute(lock_contention)\n conn.commit()\n conn.commit()\n conn.close()\n\nif __name__ == \"__main__\":\n # Number of threads to use for concurrent inserts\n num_threads = 89\n \n # Duration for which to run the inserts (in seconds)\n insert_duration = None\n \n # Number of columns in the table\n num_columns = 53\n \n # Number of rows to insert\n num_rows = 211\n \n # Size of each column (in characters)\n column_size = 93\n \n # Table name\n table_name = 'table1'\n \n # Call the insert_large_data function\n lock_contention(num_threads, insert_duration, num_columns, num_rows, column_size, table_name)\n", + "description": "In the database of an online store, data is being updated by multiple users at the same time. There are 89 users simultaneously performing a frequent update operation in a database table containing 53 columns and 211 rows of product records. Each column has a size of 93 characters. The multiple users are competing with each other to lock the database table and perform the update operation. This simulates a database exception caused by contention for locking the table.\n", + "desc": "In an Internet of Things (IoT) scenario, imagine a database used to collect and analyze data from various sensors, called 'SensorDataDB'. This database is designed to handle a large volume of data from multiple types of sensors. One of the key tables in the database is called 'SensorReadings', which stores data from 211 sensors. This table contains 53 columns, each with a maximum size of 93 characters, to store information such as sensor ID, reading type, reading value, timestamp, sensor location, and status. In this scenario, due to the high frequency at which the sensors are transmitting data, the database might face performance issues. This could be caused by factors such as improper indexing, limited buffering mechanisms, or insufficient data partitioning in the 'SensorReadings' table. These limitations can impact the database's ability to handle the large number of concurrent write requests. 
Consequently, the database's write latency may increase, leading to anomalies and potentially affecting other database operations.\n" + }, + "187": { + "start_time": "1697324075", + "end_time": "1697324145", + "start_timestamp": "2023-10-15 06:54:35", + "end_timestamp": "2023-10-15 06:55:45", + "alerts": [], + "labels": [ + "highly deletes" + ], + "command": "python anomaly_trigger/main.py --anomaly VACUUM", + "script": "import psycopg2\nimport sys\nsys.path.append('/root/DB-GPT/')\nimport time\nimport datetime\nimport random\nimport yaml\nfrom multiprocessing.pool import *\n\n\nclass DBArgs(object):\n\n def __init__(self, dbtype, config, dbname=None):\n self.dbtype = dbtype\n if self.dbtype == 'mysql':\n self.host = config['host']\n self.port = config['port']\n self.user = config['user']\n self.password = config['password']\n self.dbname = dbname if dbname else config['dbname']\n self.driver = 'com.mysql.jdbc.Driver'\n self.jdbc = 'jdbc:mysql://'\n else:\n self.host = config['host']\n self.port = config['port']\n self.user = config['user']\n self.password = config['password']\n self.dbname = dbname if dbname else config['dbname']\n self.driver = 'org.postgresql.Driver'\n self.jdbc = 'jdbc:postgresql://'\n\nclass Database():\n def __init__(self, args, timeout=-1):\n self.args = args\n self.conn = self.resetConn(timeout)\n\n\n # self.schema = self.compute_table_schema()\n\n def resetConn(self, timeout=-1):\n conn = psycopg2.connect(database=self.args.dbname,\n user=self.args.user,\n password=self.args.password,\n host=self.args.host,\n port=self.args.port)\n return conn\n \n def execute_sqls(self,sql):\n self.conn =self.resetConn(timeout=-1)\n cur = self.conn.cursor()\n cur.execute(sql)\n self.conn.commit()\n cur.close()\n self.conn.close()\n\n def execute_sql_duration(self, duration, sql, max_id=0, commit_interval=500):\n self.conn = self.resetConn(timeout=-1)\n cursor = self.conn.cursor()\n start = time.time()\n cnt = 0\n if duration > 0:\n while (time.time() - start) < duration:\n if max_id > 0:\n id = random.randint(1, max_id - 1)\n cursor.execute(sql + str(id) + ';')\n else:\n cursor.execute(sql)\n cnt += 1\n if cnt % commit_interval == 0:\n self.conn.commit()\n else:\n print(\"error, the duration should be larger than 0\")\n self.conn.commit()\n cursor.close()\n self.conn.close()\n return cnt\n\n def concurrent_execute_sql(self, threads, duration, sql, max_id=0, commit_interval=500):\n pool = ThreadPool(threads)\n results = [pool.apply_async(self.execute_sql_duration, (duration, sql, max_id, commit_interval)) for _ in range(threads)]\n pool.close()\n pool.join()\n return results\n \ndef init():\n #add the config\n config_path = \"/root/DB-GPT/config/tool_config.yaml\"\n with open(config_path, 'r') as config_file:\n config = yaml.safe_load(config_file) \n db_args =DBArgs('pgsql', config)\n return db_args\n\n\n#create a table\ndef create_table(table_name,colsize, ncolumns):\n db=Database(init())\n column_definitions = ', '.join(f'name{i} varchar({colsize})' for i in range(ncolumns))\n creat_sql = f'CREATE TABLE {table_name} (id int, {column_definitions}, time timestamp);'\n db.execute_sqls(creat_sql)\n\n#delete the table\ndef delete_table(table_name):\n db=Database(init())\n delete_sql=f'DROP TABLE if exists {table_name}'\n db.execute_sqls(delete_sql)\n\n#print the current time\ndef print_time():\n current_time = datetime.datetime.now()\n formatted_time = current_time.strftime(\"%Y-%m-%d %H:%M:%S\")\n print(formatted_time)\n\n\ndef 
vacuum(threads,duration,ncolumns,nrows,colsize,table_name='table1'):\n db=Database(init())\n #create a new table\n print_time()\n delete_table(table_name)\n create_table(table_name,colsize, ncolumns)\n\n # insert some data to be deleted\n insert_definitions = ', '.join(f'(SELECT substr(md5(random()::text), 1, {colsize}))' for i in range(ncolumns))\n insert_data=f'insert into {table_name} select generate_series(1,{nrows}),{insert_definitions}, now();' \n db.execute_sqls(insert_data) \n\n # delete 80% of the rows\n delete_nrows=int(nrows*0.8)\n vacuum=f'delete from {table_name} where id < {delete_nrows};'\n db.execute_sqls(vacuum)\n\n # do the select , then the vacuum occurs\n select='select * from '+table_name+' where id='\n db.concurrent_execute_sql(threads,duration,select,nrows)\n\n #delete the table\n delete_table(table_name)\n print_time()\n\n\nif __name__ == \"__main__\":\n # Number of threads to use for concurrent inserts\n num_threads = 89\n \n # Duration for which to run the inserts (in seconds)\n insert_duration = None\n \n # Number of columns in the table\n num_columns = 9\n \n # Number of rows to insert\n num_rows = 3648387\n \n # Size of each column (in characters)\n column_size = 82\n \n # Table name\n table_name = 'table1'\n \n # Call the insert_large_data function\n vacuum(num_threads, insert_duration, num_columns, num_rows, column_size, table_name)\n", + "description": "In a database of an online marketplace, when 89 users simultaneously search for products using terms such as product name, category, and price range, after performing a large-scale data cleaning operation on a table containing 9 columns, 3,648,387 rows, with each column having a size of 82 characters, an exception may occur due to the high workload and lack of optimization in the search process.\n", + "desc": "In the database of an e-commerce platform, there is a database used for an online store named 'OnlineStoreDB'. This database contains a key table named 'ProductRecords', which stores detailed information about products. This table consists of 3,648,387 rows of data, each representing a unique product entry. The table has 9 columns, with each column having a size of 82 characters. The columns include product ID, name, price, stock quantity, description, brand, category, size, and color. In this particular scenario, the database administrator needs to perform a vacuum operation on the database. The vacuum operation is a data maintenance process that reclaims unused space in the database, allowing for better performance and efficiency. The vacuum process involves scanning the entire table and rearranging the data to optimize storage and remove any fragmented or unused space.Since the 'ProductRecords' table is quite large, with millions of rows, and each row has multiple columns, performing the vacuum operation might take a significant amount of time and resources. If the vacuum operation is not properly managed or scheduled during low-traffic periods, it could potentially impact the performance of the database and other concurrent operations. 
However, when executed correctly, the vacuum operation can help maintain the overall health of the database and ensure its long-term efficiency.\n" + }, + "188": { + "start_time": "1697324205", + "end_time": "1697324319", + "start_timestamp": "2023-10-15 06:56:45", + "end_timestamp": "2023-10-15 06:58:39", + "alerts": [], + "labels": [ + "too many indexes" + ], + "command": "python anomaly_trigger/main.py --anomaly REDUNDANT_INDEX", + "script": "import psycopg2\nimport sys\nsys.path.append('/root/DB-GPT/')\nimport time\nimport datetime\nimport random\nimport yaml\nfrom multiprocessing.pool import *\n\n\nclass DBArgs(object):\n\n def __init__(self, dbtype, config, dbname=None):\n self.dbtype = dbtype\n if self.dbtype == 'mysql':\n self.host = config['host']\n self.port = config['port']\n self.user = config['user']\n self.password = config['password']\n self.dbname = dbname if dbname else config['dbname']\n self.driver = 'com.mysql.jdbc.Driver'\n self.jdbc = 'jdbc:mysql://'\n else:\n self.host = config['host']\n self.port = config['port']\n self.user = config['user']\n self.password = config['password']\n self.dbname = dbname if dbname else config['dbname']\n self.driver = 'org.postgresql.Driver'\n self.jdbc = 'jdbc:postgresql://'\n\nclass Database():\n def __init__(self, args, timeout=-1):\n self.args = args\n self.conn = self.resetConn(timeout)\n\n\n # self.schema = self.compute_table_schema()\n\n def resetConn(self, timeout=-1):\n conn = psycopg2.connect(database=self.args.dbname,\n user=self.args.user,\n password=self.args.password,\n host=self.args.host,\n port=self.args.port)\n return conn\n \n def execute_sqls(self,sql):\n self.conn =self.resetConn(timeout=-1)\n cur = self.conn.cursor()\n cur.execute(sql)\n self.conn.commit()\n cur.close()\n self.conn.close()\n\n def execute_sql_duration(self, duration, sql, max_id=0, commit_interval=500):\n self.conn = self.resetConn(timeout=-1)\n cursor = self.conn.cursor()\n start = time.time()\n cnt = 0\n if duration > 0:\n while (time.time() - start) < duration:\n if max_id > 0:\n id = random.randint(1, max_id - 1)\n cursor.execute(sql + str(id) + ';')\n else:\n cursor.execute(sql)\n cnt += 1\n if cnt % commit_interval == 0:\n self.conn.commit()\n else:\n print(\"error, the duration should be larger than 0\")\n self.conn.commit()\n cursor.close()\n self.conn.close()\n return cnt\n\n def concurrent_execute_sql(self, threads, duration, sql, max_id=0, commit_interval=500):\n pool = ThreadPool(threads)\n results = [pool.apply_async(self.execute_sql_duration, (duration, sql, max_id, commit_interval)) for _ in range(threads)]\n pool.close()\n pool.join()\n return results\n \n def build_index(self, table_name, idx_num):\n self.conn = self.resetConn(timeout=-1)\n cursor = self.conn.cursor()\n \n for i in range(0, idx_num):\n the_sql = 'CREATE INDEX index_' + table_name + '_' + str(i) + ' ON ' + table_name + '(name' + str(i) + ');'\n print(the_sql)\n cursor.execute(the_sql)\n\n \n self.conn.commit()\n self.conn.close()\n return\n\n\n \n def drop_index(self,table_name):\n self.conn = self.resetConn(timeout=-1)\n cursor = self.conn.cursor()\n cursor.execute(\"select indexname from pg_indexes where tablename='\"+table_name+\"';\")\n idxs = cursor.fetchall()\n for idx in idxs:\n the_sql = 'DROP INDEX ' + idx[0] + ';'\n cursor.execute(the_sql)\n print(the_sql)\n self.conn.commit()\n self.conn.close()\n return\n\n\ndef init():\n #add the config\n config_path = \"/root/DB-GPT/config/tool_config.yaml\"\n with open(config_path, 'r') as config_file:\n config = 
yaml.safe_load(config_file) \n db_args =DBArgs('pgsql', config)\n return db_args\n\n\n#create a table\ndef create_table(table_name,colsize, ncolumns):\n db=Database(init())\n column_definitions = ', '.join(f'name{i} varchar({colsize})' for i in range(ncolumns))\n creat_sql = f'CREATE TABLE {table_name} (id int, {column_definitions}, time timestamp);'\n db.execute_sqls(creat_sql)\n\n#delete the table\ndef delete_table(table_name):\n db=Database(init())\n delete_sql=f'DROP TABLE if exists {table_name}'\n db.execute_sqls(delete_sql)\n\n#print the current time\ndef print_time():\n current_time = datetime.datetime.now()\n formatted_time = current_time.strftime(\"%Y-%m-%d %H:%M:%S\")\n print(formatted_time)\n\ndef redundent_index(threads,duration,ncolumns,nrows,colsize,nindex,table_name='table1'):\n #create a new table\n print_time()\n delete_table(table_name)\n create_table(table_name,colsize, ncolumns)\n db=Database(init())\n # insert some data to be updated \n insert_definitions = ', '.join(f'(SELECT substr(md5(random()::text), 1, {colsize}))' for i in range(ncolumns))\n insert_data=f'insert into {table_name} select generate_series(1,{nrows}),{insert_definitions}, now();' \n db.execute_sqls(insert_data) \n\n #initialization of the indexes\n nindex=int((nindex*ncolumns)/10)\n db.build_index(table_name,nindex)\n id_index='CREATE INDEX index_'+table_name+'_id ON '+table_name+'(id);'\n db.execute_sqls(id_index)\n\n #lock_contention\n pool = Pool(threads)\n for _ in range(threads):\n pool.apply_async(\n lock, (table_name, ncolumns, colsize, duration, nrows))\n pool.close()\n pool.join()\n\n #drop the index\n db.drop_index(table_name)\n\n #delete the table\n delete_table(table_name)\n print_time()\n\n\ndef lock(table_name, ncolumns, colsize, duration, nrows):\n args=init()\n start = time.time()\n #lock_contention\n while time.time()-start < duration:\n conn = psycopg2.connect(database=args.dbname, user=args.user, password=args.password,\n host=args.host, port=args.port)\n cur = conn.cursor()\n while time.time()-start < duration:\n col_name = random.randint(0, ncolumns-1)\n row_name = random.randint(1, nrows-1)\n lock_contention = f'update {table_name} set name{col_name}=(SELECT substr(md5(random()::text), 1, {colsize})) where id ={row_name}'\n #db.concurrent_execute_sql(threads,duration,lock_contention,nrows)\n cur.execute(lock_contention)\n conn.commit()\n conn.commit()\n conn.close()\n\nif __name__ == \"__main__\":\n # Number of threads to use for concurrent inserts\n num_threads = 6\n \n # Duration for which to run the inserts (in seconds)\n insert_duration = None\n \n # Number of columns in the table\n num_columns = 78\n \n # Number of rows to insert\n num_rows = 485208\n \n # Size of each column (in characters)\n column_size = 66\n \n # Table name\n table_name = 'table1'\n \n nindex=6\n \n # Call the insert_large_data function\n redundent_index(num_threads, insert_duration, num_columns, num_rows, column_size, nindex,table_name)\n", + "description": "In an e-commerce database with 78 columns and 485,208 rows, each with a column size of 66 characters, a large number of indexes are created for items such as product name, category, and price range at the beginning of the query, followed by a query of 6 users. The purpose is to simulate the additional storage footprint and performance overhead caused by this process.\n", + "desc": "In a business intelligence scenario, particularly involving the financial data of large companies, there is a database called 'FinanceDataDB'. 
This database is designed to store and analyze financial records and data of various corporations. It contains multiple tables, with one key table called 'FinanceRecords' that stores detailed information about financial transactions, accounting entries, and other financial data. This 'FinanceRecords' table consists of 485,208 rows of data, each row representing a financial record, and has 78 columns, each containing information up to 66 characters long. These columns may include transaction ID, transaction type (such as income, expenditure, assets, and liabilities), amount, date, department, project code, budget code, financial year, audit status, and more.In a typical business intelligence environment, it is common to create and optimize indexes to improve the speed and performance of complex financial queries. However, if unnecessary or redundant indexes are created without proper planning or evaluation, it can lead to various issues. These redundant indexes can consume additional storage space, impact the efficiency of data insertion and updates, and degrade overall database performance. Furthermore, managing and maintaining a large number of indexes can also increase the complexity of database management tasks.Considering the given command `python anomaly_trigger/main.py --anomaly REDUNDANT_INDEX --threads 6 --ncolumn 78 --colsize 66 --nrow 485208`, it suggests running a script that triggers the scenario of redundant index creation in the 'FinanceDataDB' database. This script simulates the creation of unnecessary indexes on the 'FinanceRecords' table, which has 485,208 rows and 78 columns, for example, in response to financial queries. The script includes 6 threads to simulate concurrent index creation operations.By executing this script, it tests the database's ability to handle such index operations and examines the impact of redundant index creation on overall database performance. This scenario helps identify any performance degradation caused by excessive indexes and allows for optimization or removal of redundant indexes to improve query performance and database efficiency.\n" + }, + "189": { + "start_time": "1697324379", + "end_time": "1697324470", + "start_timestamp": "2023-10-15 06:59:39", + "end_timestamp": "2023-10-15 07:01:10", + "alerts": [], + "labels": [ + "INSERT_LARGE_DATA", + "IO_CONTENTION" + ], + "command": "python anomaly_trigger/main.py --anomaly INSERT_LARGE_DATA,IO_CONTENTION", + "script": "import os\nimport datetime\n\n#print the current time\ndef print_time():\n current_time = datetime.datetime.now()\n formatted_time = current_time.strftime(\"%Y-%m-%d %H:%M:%S\")\n print(formatted_time)\n\nif __name__ == \"__main__\":\n print_time()\n command = (\n \"su - root -c 'cd /sysbench-tpcc-master; \"\n \"./tpcc.lua --db-driver=pgsql --tables=2 --scale=3 --threads=50 --events=0 \"\n \"--pgsql-host=xxxx --pgsql-user=xxxx --pgsql-password=xxxx \"\n \"--pgsql-port=5432 --pgsql-db=tpcc --time=90 --rand-type=uniform --report-interval=10 run'\"\n )\n\n os.system(command)\n print_time()\n", + "description": "In a shared document platform, multiple users are simultaneously uploading, downloading, or editing files, causing contention for I/O resources. This leads to slower file transfer speeds.\n", + "desc": "In a file sharing system scenario, there is a database called 'FileShareDB' that is used by teams or organizations to share files. 
This database stores the actual files as well as metadata about the files, such as information about the uploader, file size, creation and modification dates, version history, access permissions, and number of downloads. Throughout the day, multiple users may be simultaneously uploading, downloading, or editing files. For example, a project team might be collaborating on a report, with team members frequently uploading the latest versions of files while others download them for viewing or editing. The system might also be used for storing and sharing large files, such as presentations, video conference recordings, or design drawings. Due to the high concurrency of file operations, the 'FileShareDB' database faces challenges related to input/output (I/O) contention. When multiple users are simultaneously uploading or downloading large files, the system's storage and network bandwidth may be significantly strained. This I/O contention can lead to slower file transfer speeds, especially in situations where there is limited bandwidth or the servers do not have sufficient processing capabilities. Additionally, frequent write operations to the database, such as file uploads and metadata updates, can impact the overall performance of the database. During peak periods, the database may encounter locking and transaction management issues, further slowing down file processing and metadata recording.\n" + }, + "190": { + "start_time": "1697324530", + "end_time": "1697324590", + "start_timestamp": "2023-10-15 07:02:10", + "end_timestamp": "2023-10-15 07:03:10", + "alerts": [], + "labels": [ + "POOR JOIN PERFORMANCE", + "CPU CONTENTION" + ], + "command": "python anomaly_trigger/main.py --anomaly POOR_JOIN_PERFORMANCE,CPU_CONTENTION", + "script": "import os\nimport re\nimport time\n\nimport psycopg2\n\n\nREPEATCOUNT = 1\nTIMELOGPATH = str(int(time.time())) + \"_job_trigger_time_log.txt\"\nTIMELOG = open(TIMELOGPATH, 'w+')\n\n\nclass Database():\n\n def __init__(self):\n self.conn = None\n self.conn = psycopg2.connect(database='imdbload',\n user='xxxx',\n password='xxxx',\n host='xxxx',\n port=5432)\n\n def execute_sql(self, sql):\n fail = 1\n cur = self.conn.cursor()\n i = 0\n cnt = 3\n while fail == 1 and i < cnt:\n try:\n fail = 0\n cur.execute(sql)\n except BaseException as error:\n fail = 1\n print(error)\n res = []\n if fail == 0:\n res = cur.fetchall()\n i = i + 1\n if fail == 1:\n # print(\"SQL Execution Fatal!!\", sql)\n return 0, ''\n elif fail == 0:\n return 1, res\n\n\ndef all_sql_files():\n res_path = \"{}/join-order-benchmark-master/\".format(\n os.path.dirname(os.path.abspath(__file__)))\n # all_file_list = list(filter(file_filter, os.listdir(res_path)))\n # all_file_list = sorted(all_file_list, key=custom_sort)\n all_file_list = [\n '1a.sql', '1b.sql', '1c.sql', '1d.sql',\n '2a.sql', '2b.sql', '2c.sql', '2d.sql',\n '3a.sql', '3b.sql', '3c.sql',\n '4a.sql', '4b.sql', '4c.sql',\n '5a.sql', '5b.sql', '5c.sql',\n '6a.sql', '6b.sql', '6c.sql', '6d.sql', '6e.sql', '6f.sql',\n '7a.sql', '7b.sql', '7c.sql',\n '8a.sql', '8b.sql', '8c.sql', '8d.sql',\n '9a.sql', '9b.sql', '9c.sql', '9d.sql',\n '10a.sql', '10b.sql', '10c.sql',\n '11a.sql', '11b.sql', '11c.sql', '11d.sql',\n '12a.sql', '12b.sql', '12c.sql',\n '13a.sql', '13b.sql', '13c.sql', '13d.sql',\n '14a.sql', '14b.sql', '14c.sql',\n '15a.sql', '15b.sql', '15c.sql', '15d.sql',\n '16a.sql', '16b.sql', '16c.sql', '16d.sql',\n '17a.sql', '17b.sql', '17c.sql', '17d.sql', '17e.sql', '17f.sql',\n '18a.sql', '18b.sql', '18c.sql',\n '19a.sql', 
'19b.sql', '19c.sql', '19d.sql',\n '20a.sql', '20b.sql', '20c.sql',\n '21a.sql', '21b.sql', '21c.sql',\n '22a.sql', '22b.sql', '22c.sql', '22d.sql',\n '23a.sql', '23b.sql', '23c.sql',\n '24a.sql', '24b.sql',\n '25a.sql', '25b.sql', '25c.sql',\n '26a.sql', '26b.sql', '26c.sql',\n '27a.sql', '27b.sql', '27c.sql',\n '28a.sql', '28b.sql', '28c.sql',\n '29a.sql', '29b.sql', '29c.sql',\n '30a.sql', '30b.sql', '30c.sql',\n '31a.sql', '31b.sql', '31c.sql',\n '32a.sql', '32b.sql',\n '33a.sql', '33b.sql', '33c.sql']\n\n print(all_file_list)\n files_list = []\n for file in all_file_list:\n files_list.append(res_path + file)\n return files_list\n\n\ndef custom_sort(item):\n # \u63d0\u53d6\u6570\u5b57\u548c\u5b57\u6bcd\u90e8\u5206\n match = re.match(r'(\\d+)(\\D+)', item)\n # \u5c06\u6570\u5b57\u90e8\u5206\u8f6c\u6362\u4e3a\u6574\u6570\u4ee5\u8fdb\u884c\u6bd4\u8f83\n num_part = int(match.group(1))\n # \u8fd4\u56de\u5143\u7ec4\u4ee5\u6309\u6570\u5b57\u548c\u5b57\u6bcd\u6392\u5e8f\n return (num_part, match.group(2))\n\n\ndef file_filter(f):\n if f[-4:] == '.sql' and 'schema' not in f and 'fkindexes' not in f:\n return True\n else:\n return False\n\n\ndef get_sql_from_file(file_name):\n file = open(file_name)\n lines = file.readlines().copy()\n sql = ''\n for line in lines:\n sql += line\n sql = sql.replace('\n', ' ').replace(' ', ' ').replace(' ', ' ')\n file.close()\n return sql\n\n\ndef test_hint_from_file(sql_file):\n db = Database()\n sql = get_sql_from_file(sql_file)\n success, result_cont = db.execute_sql(sql)\n print(success, result_cont)\n\n\ndef test_all():\n sql_files = all_sql_files()\n\n for sql_file in list(sql_files)[:-10]:\n if sql_file:\n test_hint_from_file(sql_file)\n\n\ndef test_one():\n res_path = \"{}/join-order-benchmark-master/\".format(\n os.path.dirname(os.path.abspath(__file__)))\n test_hint_from_file(res_path + '1a.sql')\n\n\nif __name__ == '__main__':\n for i in range(0, REPEATCOUNT):\n TIMELOG.write(str(int(time.time()))+\";\")\n test_all()\n TIMELOG.write(str(int(time.time()))+\"\n\")\n TIMELOG.flush()\n\n TIMELOG.close()\n\n)\n", + "description": "In a data analytics company, several data analysts are performing join queries on large datasets using the Python script \"anomaly_trigger/main.py\". The join performance is poor due to inefficient join algorithms and the lack of proper indexing on the join columns. Additionally, there is CPU contention as multiple analysts are running their queries simultaneously, competing for CPU resources. This leads to slow query execution and overall poor performance in analyzing the data.\n", + "desc": "In a business intelligence scenario, suppose there is a database named 'BusinessIntelligenceDB', which is used by a company to store and analyze various business data. This database contains multiple tables, such as 'SalesData', 'CustomerProfiles', 'ProductCatalog', and 'MarketTrends', each filled with a large number of rows and columns, recording detailed business information.In this scenario, the company's analysts frequently need to perform complex join queries across multiple tables to generate comprehensive business reports. These reports involve joining the 'SalesData' table (containing sales records) with the 'CustomerProfiles' table (containing customer information) to analyze the purchasing behaviors of different customer groups. 
Additionally, they might also need to link these data with 'ProductCatalog' and 'MarketTrends' to gain deeper market insights.However, due to the large size of these tables and the involvement of multi-table joins, the performance of these queries can be slow. If the join keys in these tables are not effectively indexed, or if the queries are not properly optimized, executing these join operations can consume a significant amount of time and resources. During peak periods, when multiple complex join queries are executed simultaneously, there might be competition for CPU resources, further reducing query efficiency. This CPU contention could occur due to too many compute-intensive queries running on the database server or insufficient CPU resources on the server.\n" + }, + "191": { + "start_time": "1697324650", + "end_time": "1697324798", + "start_timestamp": "2023-10-15 07:04:10", + "end_timestamp": "2023-10-15 07:06:38", + "alerts": [], + "labels": [ + "FETCH_LARGE_DATA", + "CORRELATED SUBQUERY" + ], + "command": "python anomaly_trigger/main.py --anomaly FETCH_LARGE_DATA,CORRELATED_SUBQUERY", + "script": "import os\nimport re\nimport time\n\nimport psycopg2\n\n\nREPEATCOUNT = 1\nTIMELOGPATH = str(int(time.time())) + \"_tpch_trigger_time_log.txt\"\nTIMELOG = open(TIMELOGPATH, 'w+')\n\n\nclass Database():\n\n def __init__(self):\n self.conn = None\n self.conn = psycopg2.connect(database='tpch',\n user='xxxx',\n password='xxxx',\n host='xxxx',\n port=5432)\n\n def execute_sql(self, sql):\n fail = 1\n cur = self.conn.cursor()\n i = 0\n cnt = 3\n while fail == 1 and i < cnt:\n try:\n fail = 0\n cur.execute(sql)\n except BaseException as error:\n fail = 1\n print(error)\n res = []\n if fail == 0:\n res = cur.fetchall()\n i = i + 1\n if fail == 1:\n # print(\"SQL Execution Fatal!!\", sql)\n return 0, ''\n elif fail == 0:\n return 1, res\n\n\ndef all_sql_files():\n res_path = \"{}/tpch-queries/\".format(\n os.path.dirname(os.path.abspath(__file__)))\n # all_file_list = list(filter(file_filter, os.listdir(res_path)))\n # all_file_list = sorted(all_file_list, key=custom_sort)\n all_file_list = [\n '4.explain.sql']\n\n print(all_file_list)\n files_list = []\n for file in all_file_list:\n files_list.append(res_path + file)\n return files_list\n\n\ndef custom_sort(item):\n # \u63d0\u53d6\u6570\u5b57\u548c\u5b57\u6bcd\u90e8\u5206\n match = re.match(r'(\\d+)(\\D+)', item)\n # \u5c06\u6570\u5b57\u90e8\u5206\u8f6c\u6362\u4e3a\u6574\u6570\u4ee5\u8fdb\u884c\u6bd4\u8f83\n num_part = int(match.group(1))\n # \u8fd4\u56de\u5143\u7ec4\u4ee5\u6309\u6570\u5b57\u548c\u5b57\u6bcd\u6392\u5e8f\n return (num_part, match.group(2))\n\n\ndef file_filter(f):\n if f[-4:] == '.sql' and 'schema' not in f and 'fkindexes' not in f:\n return True\n else:\n return False\n\n\ndef get_sql_from_file(file_name):\n file = open(file_name)\n lines = file.readlines().copy()\n sql = ''\n for line in lines:\n sql += line\n sql = sql.replace('\n', ' ').replace(' ', ' ').replace(' ', ' ')\n file.close()\n return sql\n\n\ndef test_hint_from_file(sql_file):\n db = Database()\n sql = get_sql_from_file(sql_file)\n success, result_cont = db.execute_sql(sql)\n print(success, result_cont)\n\n\ndef test_all():\n sql_files = all_sql_files()\n\n for sql_file in list(sql_files):\n if sql_file:\n test_hint_from_file(sql_file)\n\n\ndef test_one():\n res_path = \"{}/tpch-queries/\".format(\n os.path.dirname(os.path.abspath(__file__)))\n test_hint_from_file(res_path + '1.explain.sql')\n\n\nif __name__ == '__main__':\n for i in range(0, REPEATCOUNT):\n 
TIMELOG.write(str(int(time.time()))+\";\")\n test_all()\n TIMELOG.write(str(int(time.time()))+\"\n\")\n TIMELOG.flush()\n\n TIMELOG.close()\n", + "description": "In an e-commerce platform's database, when fetching large amounts of data, specifically inventory information for each product, the execution of correlated subqueries is required. The performance of this query may be affected if the subqueries are not optimized properly.\n", + "desc": "In an e-commerce platform, there is a database named 'ECommerceDB' that contains a crucial table called 'ProductInventory', which stores inventory information for various products. This table holds data for tens of thousands or even hundreds of thousands of products, including their product IDs, current stock levels, last inventory update times, supplier IDs, warehouse locations, and other details. When querying the inventory level of each product, the database needs to perform related subqueries. For example, a common query might involve selecting all products of a specific category from the 'ProductDetails' table and then conducting subqueries on the 'ProductInventory' table to obtain the inventory data for these products. However, when the number of products is large, performing individual subqueries for each product becomes inefficient and time-consuming. This is because the database needs to read a significant amount of data from the disk to retrieve inventory information for a large number of products, which can result in I/O bottlenecks.\n" + }, + "192": { + "start_time": "1697324859", + "end_time": "1697324931", + "start_timestamp": "2023-10-15 07:07:39", + "end_timestamp": "2023-10-15 07:08:51", + "alerts": [], + "labels": [ + "highly concurrent commits or highly concurrent inserts" + ], + "command": "python anomaly_trigger/main.py --anomaly INSERT_LARGE_DATA", + "script": "import psycopg2\nimport sys\nsys.path.append('/root/DB-GPT/')\nimport time\nimport datetime\nimport random\nimport yaml\nfrom multiprocessing.pool import *\n\nclass DBArgs(object):\n\n def __init__(self, dbtype, config, dbname=None):\n self.dbtype = dbtype\n if self.dbtype == 'mysql':\n self.host = config['host']\n self.port = config['port']\n self.user = config['user']\n self.password = config['password']\n self.dbname = dbname if dbname else config['dbname']\n self.driver = 'com.mysql.jdbc.Driver'\n self.jdbc = 'jdbc:mysql://'\n else:\n self.host = config['host']\n self.port = config['port']\n self.user = config['user']\n self.password = config['password']\n self.dbname = dbname if dbname else config['dbname']\n self.driver = 'org.postgresql.Driver'\n self.jdbc = 'jdbc:postgresql://'\n\nclass Database():\n def __init__(self, args, timeout=-1):\n self.args = args\n self.conn = self.resetConn(timeout)\n\n\n # self.schema = self.compute_table_schema()\n\n def resetConn(self, timeout=-1):\n conn = psycopg2.connect(database=self.args.dbname,\n user=self.args.user,\n password=self.args.password,\n host=self.args.host,\n port=self.args.port)\n return conn\n\n def execute_sqls(self,sql):\n self.conn =self.resetConn(timeout=-1)\n cur = self.conn.cursor()\n cur.execute(sql)\n self.conn.commit()\n cur.close()\n self.conn.close()\n\n def execute_sql_duration(self, duration, sql, max_id=0, commit_interval=500):\n self.conn = self.resetConn(timeout=-1)\n cursor = self.conn.cursor()\n start = time.time()\n cnt = 0\n if duration > 0:\n while (time.time() - start) < duration:\n if max_id > 0:\n id = random.randint(1, max_id - 1)\n cursor.execute(sql + str(id) + ';')\n else:\n 
cursor.execute(sql)\n cnt += 1\n if cnt % commit_interval == 0:\n self.conn.commit()\n else:\n print(\"error, the duration should be larger than 0\")\n self.conn.commit()\n cursor.close()\n self.conn.close()\n return cnt\n\n def concurrent_execute_sql(self, threads, duration, sql, max_id=0, commit_interval=500):\n pool = ThreadPool(threads)\n results = [pool.apply_async(self.execute_sql_duration, (duration, sql, max_id, commit_interval)) for _ in range(threads)]\n pool.close()\n pool.join()\n return results\n \ndef init():\n #add the config\n config_path = \"/root/DB-GPT/config/tool_config.yaml\"\n with open(config_path, 'r') as config_file:\n config = yaml.safe_load(config_file) \n db_args =DBArgs('pgsql', config)\n return db_args\n\n#create a table\ndef create_table(table_name,colsize, ncolumns):\n db=Database(init())\n column_definitions = ', '.join(f'name{i} varchar({colsize})' for i in range(ncolumns))\n creat_sql = f'CREATE TABLE {table_name} (id int, {column_definitions}, time timestamp);'\n db.execute_sqls(creat_sql)\n\n#delete the table\ndef delete_table(table_name):\n db=Database(init())\n delete_sql=f'DROP TABLE if exists {table_name}'\n db.execute_sqls(delete_sql)\n\n#print the current time\ndef print_time():\n current_time = datetime.datetime.now()\n formatted_time = current_time.strftime(\"%Y-%m-%d %H:%M:%S\")\n print(formatted_time)\n\n\ndef insert_large_data(threads,duration,ncolumns,nrows,colsize,table_name='table1'):\n\n print_time()\n #Delete undeleted tables\n delete_table(table_name)\n #create a new table\n create_table(table_name,colsize, ncolumns)\n db=Database(init())\n #insert the data\n #insert_definitions = ', '.join(f'repeat(round(random()*999)::text,{(colsize//3)})' for i in range(ncolumns))\n insert_definitions = ', '.join(f'(SELECT substr(md5(random()::text), 1, {colsize}))' for i in range(ncolumns))\n insert_data=f'insert into {table_name} select generate_series(1,{nrows}),{insert_definitions}, now();'\n db.concurrent_execute_sql(threads,duration,insert_data,commit_interval=1)\n\n #delete the table\n delete_table(table_name)\n \n #print the end time\n print_time()\nif __name__ == \"__main__\":\n # Number of threads to use for concurrent inserts\n num_threads = 144\n \n # Duration for which to run the inserts (in seconds)\n insert_duration = None\n \n # Number of columns in the table\n num_columns = 10\n \n # Number of rows to insert\n num_rows = 61\n \n # Size of each column (in characters)\n column_size = 33\n \n # Table name\n table_name = 'table1'\n \n # Call the insert_large_data function\n insert_large_data(num_threads, insert_duration, num_columns, num_rows, column_size, table_name)\n", + "description": "In a smart home automation system, there are 144 devices generating a large amount of data. This data needs to be inserted into the database simultaneously. However, the database is only configured to handle 10 columns, each with a size of 33 characters, and has a table with 61 rows. This simulation triggers a database exception due to the overload caused by the insertion of such large data.\n", + "desc": "In an Internet of Things (IoT) scenario, there is a database named 'IoTDataDB' that is used for collecting and analyzing data from various types of sensors. This database contains a key table named 'SensorReadings' which stores data from 61 sensors. Each row in this table represents a reading from a sensor, and there are 10 columns in total, each capable of storing up to 33 characters of information. 
These columns might include sensor ID, reading type (such as temperature, humidity, pressure), reading value, timestamp, sensor location, and status information.At a certain point in time, all 144 sensors start transmitting data simultaneously and at a high frequency. This influx of data can pose a challenge to the database's performance. If the database is not properly tuned or optimized to handle such a large volume of concurrent write requests, it can lead to difficulties in processing and storing the incoming data efficiently. This can result in increased write latency and performance degradation in the database, ultimately leading to anomalies.\n" + }, + "193": { + "start_time": "1697324991", + "end_time": "1697325063", + "start_timestamp": "2023-10-15 07:09:51", + "end_timestamp": "2023-10-15 07:11:03", + "alerts": [], + "labels": [ + "highly concurrent commits or highly concurrent inserts" + ], + "command": "python anomaly_trigger/main.py --anomaly INSERT_LARGE_DATA", + "script": "import psycopg2\nimport sys\nsys.path.append('/root/DB-GPT/')\nimport time\nimport datetime\nimport random\nimport yaml\nfrom multiprocessing.pool import *\n\nclass DBArgs(object):\n\n def __init__(self, dbtype, config, dbname=None):\n self.dbtype = dbtype\n if self.dbtype == 'mysql':\n self.host = config['host']\n self.port = config['port']\n self.user = config['user']\n self.password = config['password']\n self.dbname = dbname if dbname else config['dbname']\n self.driver = 'com.mysql.jdbc.Driver'\n self.jdbc = 'jdbc:mysql://'\n else:\n self.host = config['host']\n self.port = config['port']\n self.user = config['user']\n self.password = config['password']\n self.dbname = dbname if dbname else config['dbname']\n self.driver = 'org.postgresql.Driver'\n self.jdbc = 'jdbc:postgresql://'\n\nclass Database():\n def __init__(self, args, timeout=-1):\n self.args = args\n self.conn = self.resetConn(timeout)\n\n\n # self.schema = self.compute_table_schema()\n\n def resetConn(self, timeout=-1):\n conn = psycopg2.connect(database=self.args.dbname,\n user=self.args.user,\n password=self.args.password,\n host=self.args.host,\n port=self.args.port)\n return conn\n\n def execute_sqls(self,sql):\n self.conn =self.resetConn(timeout=-1)\n cur = self.conn.cursor()\n cur.execute(sql)\n self.conn.commit()\n cur.close()\n self.conn.close()\n\n def execute_sql_duration(self, duration, sql, max_id=0, commit_interval=500):\n self.conn = self.resetConn(timeout=-1)\n cursor = self.conn.cursor()\n start = time.time()\n cnt = 0\n if duration > 0:\n while (time.time() - start) < duration:\n if max_id > 0:\n id = random.randint(1, max_id - 1)\n cursor.execute(sql + str(id) + ';')\n else:\n cursor.execute(sql)\n cnt += 1\n if cnt % commit_interval == 0:\n self.conn.commit()\n else:\n print(\"error, the duration should be larger than 0\")\n self.conn.commit()\n cursor.close()\n self.conn.close()\n return cnt\n\n def concurrent_execute_sql(self, threads, duration, sql, max_id=0, commit_interval=500):\n pool = ThreadPool(threads)\n results = [pool.apply_async(self.execute_sql_duration, (duration, sql, max_id, commit_interval)) for _ in range(threads)]\n pool.close()\n pool.join()\n return results\n \ndef init():\n #add the config\n config_path = \"/root/DB-GPT/config/tool_config.yaml\"\n with open(config_path, 'r') as config_file:\n config = yaml.safe_load(config_file) \n db_args =DBArgs('pgsql', config)\n return db_args\n\n#create a table\ndef create_table(table_name,colsize, ncolumns):\n db=Database(init())\n column_definitions = ', 
'.join(f'name{i} varchar({colsize})' for i in range(ncolumns))\n creat_sql = f'CREATE TABLE {table_name} (id int, {column_definitions}, time timestamp);'\n db.execute_sqls(creat_sql)\n\n#delete the table\ndef delete_table(table_name):\n db=Database(init())\n delete_sql=f'DROP TABLE if exists {table_name}'\n db.execute_sqls(delete_sql)\n\n#print the current time\ndef print_time():\n current_time = datetime.datetime.now()\n formatted_time = current_time.strftime(\"%Y-%m-%d %H:%M:%S\")\n print(formatted_time)\n\n\ndef insert_large_data(threads,duration,ncolumns,nrows,colsize,table_name='table1'):\n\n print_time()\n #Delete undeleted tables\n delete_table(table_name)\n #create a new table\n create_table(table_name,colsize, ncolumns)\n db=Database(init())\n #insert the data\n #insert_definitions = ', '.join(f'repeat(round(random()*999)::text,{(colsize//3)})' for i in range(ncolumns))\n insert_definitions = ', '.join(f'(SELECT substr(md5(random()::text), 1, {colsize}))' for i in range(ncolumns))\n insert_data=f'insert into {table_name} select generate_series(1,{nrows}),{insert_definitions}, now();'\n db.concurrent_execute_sql(threads,duration,insert_data,commit_interval=1)\n\n #delete the table\n delete_table(table_name)\n \n #print the end time\n print_time()\nif __name__ == \"__main__\":\n # Number of threads to use for concurrent inserts\n num_threads = 144\n \n # Duration for which to run the inserts (in seconds)\n insert_duration = None\n \n # Number of columns in the table\n num_columns = 28\n \n # Number of rows to insert\n num_rows = 68\n \n # Size of each column (in characters)\n column_size = 83\n \n # Table name\n table_name = 'table1'\n \n # Call the insert_large_data function\n insert_large_data(num_threads, insert_duration, num_columns, num_rows, column_size, table_name)\n", + "description": "In a collaborative project management tool, 144 users simultaneously attempt to insert a large amount of data into a database table with 28 columns and 68 rows of records. Each column has a size of 83 characters. This simulates a database exception caused by the high volume of data being inserted.\n", + "desc": "In an e-commerce setting, there is a database used for an online store called 'OnlineStoreDB'. This database contains a key table named 'ProductRecords' that keeps track of detailed information about products. The table consists of 68 rows, each representing an individual product entry, with a total of 28 columns, where each column can store up to 83 characters. These columns may include product ID, name, price, stock quantity, description, brand, category, size, color, weight, production date, expiration date, supplier information, country, rating, number of reviews, sales status, promotional information, image link, and other related attributes. To accommodate business needs, such as updating product lines or clearing out-of-date products, a large-scale insertion operation needs to be performed. During this operation, the database administrator needs to insert a large amount of data into the 'ProductRecords' table. However, if appropriate pre-processing and optimization measures are not taken, such as incrementally inserting data or performing the operation during low traffic periods, this large-scale insertion could potentially trigger anomalies. 
Specifically, in a high-transaction e-commerce environment, these anomalies could impact other database operations, such as price updates or user queries, thereby affecting the overall operations of the online store.\n" + }, + "194": { + "start_time": "1697325123", + "end_time": "1697325183", + "start_timestamp": "2023-10-15 07:12:03", + "end_timestamp": "2023-10-15 07:13:03", + "alerts": [], + "labels": [ + "highly concurrent updates" + ], + "command": "python anomaly_trigger/main.py --anomaly LOCK_CONTENTION", + "script": "import psycopg2\nimport sys\nsys.path.append('/root/DB-GPT/')\nimport time\nimport datetime\nimport random\nimport yaml\nfrom multiprocessing.pool import *\n\n\nclass DBArgs(object):\n\n def __init__(self, dbtype, config, dbname=None):\n self.dbtype = dbtype\n if self.dbtype == 'mysql':\n self.host = config['host']\n self.port = config['port']\n self.user = config['user']\n self.password = config['password']\n self.dbname = dbname if dbname else config['dbname']\n self.driver = 'com.mysql.jdbc.Driver'\n self.jdbc = 'jdbc:mysql://'\n else:\n self.host = config['host']\n self.port = config['port']\n self.user = config['user']\n self.password = config['password']\n self.dbname = dbname if dbname else config['dbname']\n self.driver = 'org.postgresql.Driver'\n self.jdbc = 'jdbc:postgresql://'\n\nclass Database():\n def __init__(self, args, timeout=-1):\n self.args = args\n self.conn = self.resetConn(timeout)\n\n\n # self.schema = self.compute_table_schema()\n\n def resetConn(self, timeout=-1):\n conn = psycopg2.connect(database=self.args.dbname,\n user=self.args.user,\n password=self.args.password,\n host=self.args.host,\n port=self.args.port)\n return conn\n \n def execute_sqls(self,sql):\n self.conn =self.resetConn(timeout=-1)\n cur = self.conn.cursor()\n cur.execute(sql)\n self.conn.commit()\n cur.close()\n self.conn.close()\n\n def execute_sql_duration(self, duration, sql, max_id=0, commit_interval=500):\n self.conn = self.resetConn(timeout=-1)\n cursor = self.conn.cursor()\n start = time.time()\n cnt = 0\n if duration > 0:\n while (time.time() - start) < duration:\n if max_id > 0:\n id = random.randint(1, max_id - 1)\n cursor.execute(sql + str(id) + ';')\n else:\n cursor.execute(sql)\n cnt += 1\n if cnt % commit_interval == 0:\n self.conn.commit()\n else:\n print(\"error, the duration should be larger than 0\")\n self.conn.commit()\n cursor.close()\n self.conn.close()\n return cnt\n\n def concurrent_execute_sql(self, threads, duration, sql, max_id=0, commit_interval=500):\n pool = ThreadPool(threads)\n results = [pool.apply_async(self.execute_sql_duration, (duration, sql, max_id, commit_interval)) for _ in range(threads)]\n pool.close()\n pool.join()\n return results\n\ndef init():\n #add the config\n config_path = \"/root/DB-GPT/config/tool_config.yaml\"\n with open(config_path, 'r') as config_file:\n config = yaml.safe_load(config_file) \n db_args =DBArgs('pgsql', config)\n return db_args\n\n\n#create a table\ndef create_table(table_name,colsize, ncolumns):\n db=Database(init())\n column_definitions = ', '.join(f'name{i} varchar({colsize})' for i in range(ncolumns))\n creat_sql = f'CREATE TABLE {table_name} (id int, {column_definitions}, time timestamp);'\n db.execute_sqls(creat_sql)\n\n#delete the table\ndef delete_table(table_name):\n db=Database(init())\n delete_sql=f'DROP TABLE if exists {table_name}'\n db.execute_sqls(delete_sql)\n\n#print the current time\ndef print_time():\n current_time = datetime.datetime.now()\n formatted_time = 
current_time.strftime(\"%Y-%m-%d %H:%M:%S\")\n print(formatted_time)\n\n\ndef lock_contention(threads,duration,ncolumns,nrows,colsize,table_name='table1'):\n #create a new table\n print_time()\n delete_table(table_name)\n create_table(table_name,colsize, ncolumns)\n db=Database(init())\n # insert some data to be updated \n insert_definitions = ', '.join(f'(SELECT substr(md5(random()::text), 1, {colsize}))' for i in range(ncolumns))\n insert_data=f'insert into {table_name} select generate_series(1,{nrows}),{insert_definitions}, now();' \n db.execute_sqls(insert_data) \n pool = Pool(threads)\n for _ in range(threads):\n pool.apply_async(\n lock, (table_name, ncolumns, colsize, duration, nrows))\n pool.close()\n pool.join()\n #delete the table\n delete_table(table_name)\n print_time()\n\ndef lock(table_name, ncolumns, colsize, duration, nrows):\n args=init()\n start = time.time()\n #lock_contention\n while time.time()-start < duration:\n conn = psycopg2.connect(database=args.dbname, user=args.user, password=args.password,\n host=args.host, port=args.port)\n cur = conn.cursor()\n while time.time()-start < duration:\n col_name = random.randint(0, ncolumns-1)\n row_name = random.randint(1, nrows-1)\n lock_contention = f'update {table_name} set name{col_name}=(SELECT substr(md5(random()::text), 1, {colsize})) where id ={row_name}'\n #db.concurrent_execute_sql(threads,duration,lock_contention,nrows)\n cur.execute(lock_contention)\n conn.commit()\n conn.commit()\n conn.close()\n\nif __name__ == \"__main__\":\n # Number of threads to use for concurrent inserts\n num_threads = 56\n \n # Duration for which to run the inserts (in seconds)\n insert_duration = None\n \n # Number of columns in the table\n num_columns = 69\n \n # Number of rows to insert\n num_rows = 231\n \n # Size of each column (in characters)\n column_size = 62\n \n # Table name\n table_name = 'table1'\n \n # Call the insert_large_data function\n lock_contention(num_threads, insert_duration, num_columns, num_rows, column_size, table_name)\n", + "description": "In a database system for a medical research organization, 56 researchers simultaneously attempt to update a database table containing 69 columns and 231 rows of patient records. Each column has a size of 62 characters. There is a high level of contention between the researchers, causing the database table to be frequently locked and resulting in a slowdown in the update process.\n", + "desc": "In an Internet of Things (IoT) scenario, there is a database used for collecting and analyzing sensor data, specifically from smart devices in a smart home environment. This database, called 'SmartHomeDB', contains a table named 'SensorData', which stores information from various sensors, such as temperature, humidity, motion, light, and more. Each row in this table represents a data reading from a sensor, and there are a total of 231 rows. The table has 69 columns, each with a size of 62 characters, capturing different attributes of the sensor data. These attributes may include sensor ID, sensor type, location, timestamp, sensor value, and status.In this scenario, 56 sensors are actively transmitting data to the 'SensorData' table at the same time. Due to the high concurrency of these write operations, there can be contention for database locks when multiple sensors try to update the table simultaneously. 
This lock contention can result in delayed processing or even failure of some data insertion operations, impacting the real-time monitoring and control capabilities of the smart home system. Additionally, if this lock contention occurs frequently, it can lead to increased database write latency and potentially affect the overall performance of the smart home IoT system.\n" + }, + "195": { + "start_time": "1697325243", + "end_time": "1697325353", + "start_timestamp": "2023-10-15 07:14:03", + "end_timestamp": "2023-10-15 07:15:53", + "alerts": [], + "labels": [ + "highly deletes" + ], + "command": "python anomaly_trigger/main.py --anomaly VACUUM", + "script": "import psycopg2\nimport sys\nsys.path.append('/root/DB-GPT/')\nimport time\nimport datetime\nimport random\nimport yaml\nfrom multiprocessing.pool import *\n\n\nclass DBArgs(object):\n\n def __init__(self, dbtype, config, dbname=None):\n self.dbtype = dbtype\n if self.dbtype == 'mysql':\n self.host = config['host']\n self.port = config['port']\n self.user = config['user']\n self.password = config['password']\n self.dbname = dbname if dbname else config['dbname']\n self.driver = 'com.mysql.jdbc.Driver'\n self.jdbc = 'jdbc:mysql://'\n else:\n self.host = config['host']\n self.port = config['port']\n self.user = config['user']\n self.password = config['password']\n self.dbname = dbname if dbname else config['dbname']\n self.driver = 'org.postgresql.Driver'\n self.jdbc = 'jdbc:postgresql://'\n\nclass Database():\n def __init__(self, args, timeout=-1):\n self.args = args\n self.conn = self.resetConn(timeout)\n\n\n # self.schema = self.compute_table_schema()\n\n def resetConn(self, timeout=-1):\n conn = psycopg2.connect(database=self.args.dbname,\n user=self.args.user,\n password=self.args.password,\n host=self.args.host,\n port=self.args.port)\n return conn\n \n def execute_sqls(self,sql):\n self.conn =self.resetConn(timeout=-1)\n cur = self.conn.cursor()\n cur.execute(sql)\n self.conn.commit()\n cur.close()\n self.conn.close()\n\n def execute_sql_duration(self, duration, sql, max_id=0, commit_interval=500):\n self.conn = self.resetConn(timeout=-1)\n cursor = self.conn.cursor()\n start = time.time()\n cnt = 0\n if duration > 0:\n while (time.time() - start) < duration:\n if max_id > 0:\n id = random.randint(1, max_id - 1)\n cursor.execute(sql + str(id) + ';')\n else:\n cursor.execute(sql)\n cnt += 1\n if cnt % commit_interval == 0:\n self.conn.commit()\n else:\n print(\"error, the duration should be larger than 0\")\n self.conn.commit()\n cursor.close()\n self.conn.close()\n return cnt\n\n def concurrent_execute_sql(self, threads, duration, sql, max_id=0, commit_interval=500):\n pool = ThreadPool(threads)\n results = [pool.apply_async(self.execute_sql_duration, (duration, sql, max_id, commit_interval)) for _ in range(threads)]\n pool.close()\n pool.join()\n return results\n \ndef init():\n #add the config\n config_path = \"/root/DB-GPT/config/tool_config.yaml\"\n with open(config_path, 'r') as config_file:\n config = yaml.safe_load(config_file) \n db_args =DBArgs('pgsql', config)\n return db_args\n\n\n#create a table\ndef create_table(table_name,colsize, ncolumns):\n db=Database(init())\n column_definitions = ', '.join(f'name{i} varchar({colsize})' for i in range(ncolumns))\n creat_sql = f'CREATE TABLE {table_name} (id int, {column_definitions}, time timestamp);'\n db.execute_sqls(creat_sql)\n\n#delete the table\ndef delete_table(table_name):\n db=Database(init())\n delete_sql=f'DROP TABLE if exists {table_name}'\n 
db.execute_sqls(delete_sql)\n\n#print the current time\ndef print_time():\n current_time = datetime.datetime.now()\n formatted_time = current_time.strftime(\"%Y-%m-%d %H:%M:%S\")\n print(formatted_time)\n\n\ndef vacuum(threads,duration,ncolumns,nrows,colsize,table_name='table1'):\n db=Database(init())\n #create a new table\n print_time()\n delete_table(table_name)\n create_table(table_name,colsize, ncolumns)\n\n # insert some data to be deleted\n insert_definitions = ', '.join(f'(SELECT substr(md5(random()::text), 1, {colsize}))' for i in range(ncolumns))\n insert_data=f'insert into {table_name} select generate_series(1,{nrows}),{insert_definitions}, now();' \n db.execute_sqls(insert_data) \n\n # delete 80% of the rows\n delete_nrows=int(nrows*0.8)\n vacuum=f'delete from {table_name} where id < {delete_nrows};'\n db.execute_sqls(vacuum)\n\n # do the select , then the vacuum occurs\n select='select * from '+table_name+' where id='\n db.concurrent_execute_sql(threads,duration,select,nrows)\n\n #delete the table\n delete_table(table_name)\n print_time()\n\n\nif __name__ == \"__main__\":\n # Number of threads to use for concurrent inserts\n num_threads = 61\n \n # Duration for which to run the inserts (in seconds)\n insert_duration = None\n \n # Number of columns in the table\n num_columns = 10\n \n # Number of rows to insert\n num_rows = 2097306\n \n # Size of each column (in characters)\n column_size = 83\n \n # Table name\n table_name = 'table1'\n \n # Call the insert_large_data function\n vacuum(num_threads, insert_duration, num_columns, num_rows, column_size, table_name)\n", + "description": "In an online retail database with 10 columns and 2,097,306 rows of product records, each column having a size of 83 characters, a large number of users (61 in this case) perform searches on the database after a vacuum operation, which is a process of reclaiming storage space and optimizing performance. This simulates the scenario where multiple users search for products simultaneously, after a cleanup operation, leading to potential database exceptions.\n", + "desc": "In an e-commerce scenario, there is a database called 'ECommerceDB' that stores information about products sold on an online platform. Within this database, there is a table named 'ProductInfo' that contains detailed information about the products, such as product ID, name, price, stock quantity, description, brand, category, and other attributes. This table has 2,097,306 rows of data, with each row representing a unique product entry. Additionally, there are 10 columns in this table, each capable of storing up to 83 characters of information.Sometimes, the database administrator needs to perform a 'vacuum' operation on the 'ProductInfo' table. This operation involves optimizing the table for better performance by reclaiming unused space, reorganizing data, and updating statistics. 
However, performing this 'vacuum' operation on a large table with millions of rows can be resource-intensive and time-consuming.\n" + }, + "196": { + "start_time": "1697325413", + "end_time": "1697325528", + "start_timestamp": "2023-10-15 07:16:53", + "end_timestamp": "2023-10-15 07:18:48", + "alerts": [], + "labels": [ + "too many indexes" + ], + "command": "python anomaly_trigger/main.py --anomaly REDUNDANT_INDEX", + "script": "import psycopg2\nimport sys\nsys.path.append('/root/DB-GPT/')\nimport time\nimport datetime\nimport random\nimport yaml\nfrom multiprocessing.pool import *\n\n\nclass DBArgs(object):\n\n def __init__(self, dbtype, config, dbname=None):\n self.dbtype = dbtype\n if self.dbtype == 'mysql':\n self.host = config['host']\n self.port = config['port']\n self.user = config['user']\n self.password = config['password']\n self.dbname = dbname if dbname else config['dbname']\n self.driver = 'com.mysql.jdbc.Driver'\n self.jdbc = 'jdbc:mysql://'\n else:\n self.host = config['host']\n self.port = config['port']\n self.user = config['user']\n self.password = config['password']\n self.dbname = dbname if dbname else config['dbname']\n self.driver = 'org.postgresql.Driver'\n self.jdbc = 'jdbc:postgresql://'\n\nclass Database():\n def __init__(self, args, timeout=-1):\n self.args = args\n self.conn = self.resetConn(timeout)\n\n\n # self.schema = self.compute_table_schema()\n\n def resetConn(self, timeout=-1):\n conn = psycopg2.connect(database=self.args.dbname,\n user=self.args.user,\n password=self.args.password,\n host=self.args.host,\n port=self.args.port)\n return conn\n \n def execute_sqls(self,sql):\n self.conn =self.resetConn(timeout=-1)\n cur = self.conn.cursor()\n cur.execute(sql)\n self.conn.commit()\n cur.close()\n self.conn.close()\n\n def execute_sql_duration(self, duration, sql, max_id=0, commit_interval=500):\n self.conn = self.resetConn(timeout=-1)\n cursor = self.conn.cursor()\n start = time.time()\n cnt = 0\n if duration > 0:\n while (time.time() - start) < duration:\n if max_id > 0:\n id = random.randint(1, max_id - 1)\n cursor.execute(sql + str(id) + ';')\n else:\n cursor.execute(sql)\n cnt += 1\n if cnt % commit_interval == 0:\n self.conn.commit()\n else:\n print(\"error, the duration should be larger than 0\")\n self.conn.commit()\n cursor.close()\n self.conn.close()\n return cnt\n\n def concurrent_execute_sql(self, threads, duration, sql, max_id=0, commit_interval=500):\n pool = ThreadPool(threads)\n results = [pool.apply_async(self.execute_sql_duration, (duration, sql, max_id, commit_interval)) for _ in range(threads)]\n pool.close()\n pool.join()\n return results\n \n def build_index(self, table_name, idx_num):\n self.conn = self.resetConn(timeout=-1)\n cursor = self.conn.cursor()\n \n for i in range(0, idx_num):\n the_sql = 'CREATE INDEX index_' + table_name + '_' + str(i) + ' ON ' + table_name + '(name' + str(i) + ');'\n print(the_sql)\n cursor.execute(the_sql)\n\n \n self.conn.commit()\n self.conn.close()\n return\n\n\n \n def drop_index(self,table_name):\n self.conn = self.resetConn(timeout=-1)\n cursor = self.conn.cursor()\n cursor.execute(\"select indexname from pg_indexes where tablename='\"+table_name+\"';\")\n idxs = cursor.fetchall()\n for idx in idxs:\n the_sql = 'DROP INDEX ' + idx[0] + ';'\n cursor.execute(the_sql)\n print(the_sql)\n self.conn.commit()\n self.conn.close()\n return\n\n\ndef init():\n #add the config\n config_path = \"/root/DB-GPT/config/tool_config.yaml\"\n with open(config_path, 'r') as config_file:\n config = 
yaml.safe_load(config_file) \n db_args =DBArgs('pgsql', config)\n return db_args\n\n\n#create a table\ndef create_table(table_name,colsize, ncolumns):\n db=Database(init())\n column_definitions = ', '.join(f'name{i} varchar({colsize})' for i in range(ncolumns))\n creat_sql = f'CREATE TABLE {table_name} (id int, {column_definitions}, time timestamp);'\n db.execute_sqls(creat_sql)\n\n#delete the table\ndef delete_table(table_name):\n db=Database(init())\n delete_sql=f'DROP TABLE if exists {table_name}'\n db.execute_sqls(delete_sql)\n\n#print the current time\ndef print_time():\n current_time = datetime.datetime.now()\n formatted_time = current_time.strftime(\"%Y-%m-%d %H:%M:%S\")\n print(formatted_time)\n\ndef redundent_index(threads,duration,ncolumns,nrows,colsize,nindex,table_name='table1'):\n #create a new table\n print_time()\n delete_table(table_name)\n create_table(table_name,colsize, ncolumns)\n db=Database(init())\n # insert some data to be updated \n insert_definitions = ', '.join(f'(SELECT substr(md5(random()::text), 1, {colsize}))' for i in range(ncolumns))\n insert_data=f'insert into {table_name} select generate_series(1,{nrows}),{insert_definitions}, now();' \n db.execute_sqls(insert_data) \n\n #initialization of the indexes\n nindex=int((nindex*ncolumns)/10)\n db.build_index(table_name,nindex)\n id_index='CREATE INDEX index_'+table_name+'_id ON '+table_name+'(id);'\n db.execute_sqls(id_index)\n\n #lock_contention\n pool = Pool(threads)\n for _ in range(threads):\n pool.apply_async(\n lock, (table_name, ncolumns, colsize, duration, nrows))\n pool.close()\n pool.join()\n\n #drop the index\n db.drop_index(table_name)\n\n #delete the table\n delete_table(table_name)\n print_time()\n\n\ndef lock(table_name, ncolumns, colsize, duration, nrows):\n args=init()\n start = time.time()\n #lock_contention\n while time.time()-start < duration:\n conn = psycopg2.connect(database=args.dbname, user=args.user, password=args.password,\n host=args.host, port=args.port)\n cur = conn.cursor()\n while time.time()-start < duration:\n col_name = random.randint(0, ncolumns-1)\n row_name = random.randint(1, nrows-1)\n lock_contention = f'update {table_name} set name{col_name}=(SELECT substr(md5(random()::text), 1, {colsize})) where id ={row_name}'\n #db.concurrent_execute_sql(threads,duration,lock_contention,nrows)\n cur.execute(lock_contention)\n conn.commit()\n conn.commit()\n conn.close()\n\nif __name__ == \"__main__\":\n # Number of threads to use for concurrent inserts\n num_threads = 10\n \n # Duration for which to run the inserts (in seconds)\n insert_duration = None\n \n # Number of columns in the table\n num_columns = 71\n \n # Number of rows to insert\n num_rows = 819314\n \n # Size of each column (in characters)\n column_size = 75\n \n # Table name\n table_name = 'table1'\n \n nindex=6\n \n # Call the insert_large_data function\n redundent_index(num_threads, insert_duration, num_columns, num_rows, column_size, nindex,table_name)\n", + "description": "In an online marketplace database, there are 10 users querying a table with 71 columns and 819,314 rows, where each column has a size of 75 characters. Initially, there are redundant indexes created for columns such as product name, category, and price range. These indexes are later removed after the query operation. 
This simulates the impact of additional storage space and performance overhead caused by redundant indexes in a real-life scenario.\n", + "desc": "In an e-commerce scenario, suppose there is a database named 'ECommerceDB', which serves as the foundation for an online store. This database contains a key table named 'ProductRecords', which stores detailed information about various products. The 'ProductRecords' table contains a vast amount of data, with 819,314 rows, each representing a unique product entry. Each row consists of 71 columns, with each column capable of holding up to 75 characters. These columns may include information such as product ID, name, price, stock quantity, description, brand, category, size, color, weight, production date, expiration date, supplier information, country, rating, number of reviews, sales status, promotional information, image link, and other relevant attributes.In this scenario, due to the complexity of the business intelligence analysis processes in the e-commerce domain, the administrator may attempt to improve query performance by creating redundant indexes. These indexes could be designed to accelerate specific types of queries, such as category-based searches, price range filtering, or brand-specific queries. However, creating a large number of redundant indexes can have implications for the database.When multiple users simultaneously execute queries that rely on these indexes, the database's performance might be negatively impacted. The presence of redundant indexes can lead to increased storage usage, slower data insertion, deletion, and update operations, and potential fragmentation issues. This can result in delayed query execution, increased resource consumption, and reduced overall efficiency in accessing and manipulating the product records.\n" + }, + "197": { + "start_time": "1697325588", + "end_time": "1697325679", + "start_timestamp": "2023-10-15 07:19:48", + "end_timestamp": "2023-10-15 07:21:19", + "alerts": [], + "labels": [ + "INSERT_LARGE_DATA", + "IO_CONTENTION" + ], + "command": "python anomaly_trigger/main.py --anomaly INSERT_LARGE_DATA,IO_CONTENTION", + "script": "import os\nimport datetime\n\n#print the current time\ndef print_time():\n current_time = datetime.datetime.now()\n formatted_time = current_time.strftime(\"%Y-%m-%d %H:%M:%S\")\n print(formatted_time)\n\nif __name__ == \"__main__\":\n print_time()\n command = (\n \"su - root -c 'cd /sysbench-tpcc-master; \"\n \"./tpcc.lua --db-driver=pgsql --tables=2 --scale=3 --threads=50 --events=0 \"\n \"--pgsql-host=xxxx --pgsql-user=xxxx --pgsql-password=xxxx \"\n \"--pgsql-port=5432 --pgsql-db=tpcc --time=90 --rand-type=uniform --report-interval=10 run'\"\n )\n\n os.system(command)\n print_time()\n", + "description": "In a cloud storage service, multiple users are simultaneously uploading, downloading, or editing large files, causing contention in the I/O operations. This results in slower file transfer speeds and potential performance issues.\n", + "desc": "In an Internet of Things (IoT) scenario involving a smart home system, there is a database named 'SmartHomeDB' which is responsible for storing and managing data from various smart devices in a household. This database contains a table named 'DeviceReadings' which records the readings and sensor data from these devices. Each row in this table represents a reading from a specific device and includes information such as device ID, sensor type, reading value, timestamp, and device status. 
Suppose that at a specific time, multiple devices in the smart home start sending data updates to the 'DeviceReadings' table simultaneously, resulting in a large influx of data. The increased volume of data being written to the database can cause I/O contention. This means that the system's storage and network resources may be overwhelmed and unable to handle the high rate of incoming data. As a result, the performance of the database may be negatively impacted, leading to slower write speeds and potential delays or failures in recording the incoming data. This I/O contention can also affect other operations and services in the smart home system, further impacting the overall performance and user experience.\n" + }, + "198": { + "start_time": "1697325739", + "end_time": "1697325800", + "start_timestamp": "2023-10-15 07:22:19", + "end_timestamp": "2023-10-15 07:23:20", + "alerts": [], + "labels": [ + "POOR JOIN PERFORMANCE", + "CPU CONTENTION" + ], + "command": "python anomaly_trigger/main.py --anomaly POOR_JOIN_PERFORMANCE,CPU_CONTENTION", + "script": "import os\nimport re\nimport time\n\nimport psycopg2\n\n\nREPEATCOUNT = 1\nTIMELOGPATH = str(int(time.time())) + \"_job_trigger_time_log.txt\"\nTIMELOG = open(TIMELOGPATH, 'w+')\n\n\nclass Database():\n\n def __init__(self):\n self.conn = None\n self.conn = psycopg2.connect(database='imdbload',\n user='xxxx',\n password='xxxx',\n host='xxxx',\n port=5432)\n\n def execute_sql(self, sql):\n fail = 1\n cur = self.conn.cursor()\n i = 0\n cnt = 3\n while fail == 1 and i < cnt:\n try:\n fail = 0\n cur.execute(sql)\n except BaseException as error:\n fail = 1\n print(error)\n res = []\n if fail == 0:\n res = cur.fetchall()\n i = i + 1\n if fail == 1:\n # print(\"SQL Execution Fatal!!\", sql)\n return 0, ''\n elif fail == 0:\n return 1, res\n\n\ndef all_sql_files():\n res_path = \"{}/join-order-benchmark-master/\".format(\n os.path.dirname(os.path.abspath(__file__)))\n # all_file_list = list(filter(file_filter, os.listdir(res_path)))\n # all_file_list = sorted(all_file_list, key=custom_sort)\n all_file_list = [\n '1a.sql', '1b.sql', '1c.sql', '1d.sql',\n '2a.sql', '2b.sql', '2c.sql', '2d.sql',\n '3a.sql', '3b.sql', '3c.sql',\n '4a.sql', '4b.sql', '4c.sql',\n '5a.sql', '5b.sql', '5c.sql',\n '6a.sql', '6b.sql', '6c.sql', '6d.sql', '6e.sql', '6f.sql',\n '7a.sql', '7b.sql', '7c.sql',\n '8a.sql', '8b.sql', '8c.sql', '8d.sql',\n '9a.sql', '9b.sql', '9c.sql', '9d.sql',\n '10a.sql', '10b.sql', '10c.sql',\n '11a.sql', '11b.sql', '11c.sql', '11d.sql',\n '12a.sql', '12b.sql', '12c.sql',\n '13a.sql', '13b.sql', '13c.sql', '13d.sql',\n '14a.sql', '14b.sql', '14c.sql',\n '15a.sql', '15b.sql', '15c.sql', '15d.sql',\n '16a.sql', '16b.sql', '16c.sql', '16d.sql',\n '17a.sql', '17b.sql', '17c.sql', '17d.sql', '17e.sql', '17f.sql',\n '18a.sql', '18b.sql', '18c.sql',\n '19a.sql', '19b.sql', '19c.sql', '19d.sql',\n '20a.sql', '20b.sql', '20c.sql',\n '21a.sql', '21b.sql', '21c.sql',\n '22a.sql', '22b.sql', '22c.sql', '22d.sql',\n '23a.sql', '23b.sql', '23c.sql',\n '24a.sql', '24b.sql',\n '25a.sql', '25b.sql', '25c.sql',\n '26a.sql', '26b.sql', '26c.sql',\n '27a.sql', '27b.sql', '27c.sql',\n '28a.sql', '28b.sql', '28c.sql',\n '29a.sql', '29b.sql', '29c.sql',\n '30a.sql', '30b.sql', '30c.sql',\n '31a.sql', '31b.sql', '31c.sql',\n '32a.sql', '32b.sql',\n '33a.sql', '33b.sql', '33c.sql']\n\n print(all_file_list)\n files_list = []\n for file in all_file_list:\n files_list.append(res_path + file)\n return files_list\n\n\ndef custom_sort(item):\n # 
\u63d0\u53d6\u6570\u5b57\u548c\u5b57\u6bcd\u90e8\u5206\n match = re.match(r'(\\d+)(\\D+)', item)\n # \u5c06\u6570\u5b57\u90e8\u5206\u8f6c\u6362\u4e3a\u6574\u6570\u4ee5\u8fdb\u884c\u6bd4\u8f83\n num_part = int(match.group(1))\n # \u8fd4\u56de\u5143\u7ec4\u4ee5\u6309\u6570\u5b57\u548c\u5b57\u6bcd\u6392\u5e8f\n return (num_part, match.group(2))\n\n\ndef file_filter(f):\n if f[-4:] == '.sql' and 'schema' not in f and 'fkindexes' not in f:\n return True\n else:\n return False\n\n\ndef get_sql_from_file(file_name):\n file = open(file_name)\n lines = file.readlines().copy()\n sql = ''\n for line in lines:\n sql += line\n sql = sql.replace('\n', ' ').replace(' ', ' ').replace(' ', ' ')\n file.close()\n return sql\n\n\ndef test_hint_from_file(sql_file):\n db = Database()\n sql = get_sql_from_file(sql_file)\n success, result_cont = db.execute_sql(sql)\n print(success, result_cont)\n\n\ndef test_all():\n sql_files = all_sql_files()\n\n for sql_file in list(sql_files)[:-10]:\n if sql_file:\n test_hint_from_file(sql_file)\n\n\ndef test_one():\n res_path = \"{}/join-order-benchmark-master/\".format(\n os.path.dirname(os.path.abspath(__file__)))\n test_hint_from_file(res_path + '1a.sql')\n\n\nif __name__ == '__main__':\n for i in range(0, REPEATCOUNT):\n TIMELOG.write(str(int(time.time()))+\";\")\n test_all()\n TIMELOG.write(str(int(time.time()))+\"\n\")\n TIMELOG.flush()\n\n TIMELOG.close()\n\n)\n", + "description": "In a software development company, multiple developers are performing a join operation on two large database tables to merge data. This join operation is causing poor performance due to the lack of optimization, which is putting a heavy load on the CPU and causing contention among the developers.\n", + "desc": "In a business intelligence scenario, there is a database called 'CorporateAnalyticsDB' that is used for storing and analyzing various business data of a large corporation. This database contains multiple complex tables, such as 'SalesData', 'CustomerProfiles', 'ProductCatalog', and 'MarketTrends', which record detailed business information. In this specific scenario, the company's analysts frequently need to perform complex join queries across multiple tables to generate comprehensive business reports. For example, they might need to join the 'SalesData' table (containing sales records) with the 'CustomerProfiles' table (containing customer information) to analyze the purchasing behaviors of different customer groups. At the same time, they might also need to link these data with the 'ProductCatalog' (the product catalog) and 'MarketTrends' (market trends) to gain deeper market insights. However, due to the large size of these tables and the involvement of multi-table joins, the performance of these queries can become very slow. If the join keys in these tables are not effectively indexed or if the queries are not properly optimized, executing these join operations can consume a significant amount of time and resources. During peak periods, when multiple complex join queries are executed simultaneously, there can be competition for CPU resources, further reducing the query efficiency. 
This CPU contention might occur because there are too many compute-intensive queries running on the database server or because the server's CPU resources are insufficient to handle these queries.\n" + }, + "199": { + "start_time": "1697325860", + "end_time": "1697326000", + "start_timestamp": "2023-10-15 07:24:20", + "end_timestamp": "2023-10-15 07:26:40", + "alerts": [], + "labels": [ + "FETCH_LARGE_DATA", + "CORRELATED SUBQUERY" + ], + "command": "python anomaly_trigger/main.py --anomaly FETCH_LARGE_DATA,CORRELATED_SUBQUERY", + "script": "import os\nimport re\nimport time\n\nimport psycopg2\n\n\nREPEATCOUNT = 1\nTIMELOGPATH = str(int(time.time())) + \"_tpch_trigger_time_log.txt\"\nTIMELOG = open(TIMELOGPATH, 'w+')\n\n\nclass Database():\n\n def __init__(self):\n self.conn = None\n self.conn = psycopg2.connect(database='tpch',\n user='xxxx',\n password='xxxx',\n host='xxxx',\n port=5432)\n\n def execute_sql(self, sql):\n fail = 1\n cur = self.conn.cursor()\n i = 0\n cnt = 3\n while fail == 1 and i < cnt:\n try:\n fail = 0\n cur.execute(sql)\n except BaseException as error:\n fail = 1\n print(error)\n res = []\n if fail == 0:\n res = cur.fetchall()\n i = i + 1\n if fail == 1:\n # print(\"SQL Execution Fatal!!\", sql)\n return 0, ''\n elif fail == 0:\n return 1, res\n\n\ndef all_sql_files():\n res_path = \"{}/tpch-queries/\".format(\n os.path.dirname(os.path.abspath(__file__)))\n # all_file_list = list(filter(file_filter, os.listdir(res_path)))\n # all_file_list = sorted(all_file_list, key=custom_sort)\n all_file_list = [\n '4.explain.sql']\n\n print(all_file_list)\n files_list = []\n for file in all_file_list:\n files_list.append(res_path + file)\n return files_list\n\n\ndef custom_sort(item):\n # \u63d0\u53d6\u6570\u5b57\u548c\u5b57\u6bcd\u90e8\u5206\n match = re.match(r'(\\d+)(\\D+)', item)\n # \u5c06\u6570\u5b57\u90e8\u5206\u8f6c\u6362\u4e3a\u6574\u6570\u4ee5\u8fdb\u884c\u6bd4\u8f83\n num_part = int(match.group(1))\n # \u8fd4\u56de\u5143\u7ec4\u4ee5\u6309\u6570\u5b57\u548c\u5b57\u6bcd\u6392\u5e8f\n return (num_part, match.group(2))\n\n\ndef file_filter(f):\n if f[-4:] == '.sql' and 'schema' not in f and 'fkindexes' not in f:\n return True\n else:\n return False\n\n\ndef get_sql_from_file(file_name):\n file = open(file_name)\n lines = file.readlines().copy()\n sql = ''\n for line in lines:\n sql += line\n sql = sql.replace('\n', ' ').replace(' ', ' ').replace(' ', ' ')\n file.close()\n return sql\n\n\ndef test_hint_from_file(sql_file):\n db = Database()\n sql = get_sql_from_file(sql_file)\n success, result_cont = db.execute_sql(sql)\n print(success, result_cont)\n\n\ndef test_all():\n sql_files = all_sql_files()\n\n for sql_file in list(sql_files):\n if sql_file:\n test_hint_from_file(sql_file)\n\n\ndef test_one():\n res_path = \"{}/tpch-queries/\".format(\n os.path.dirname(os.path.abspath(__file__)))\n test_hint_from_file(res_path + '1.explain.sql')\n\n\nif __name__ == '__main__':\n for i in range(0, REPEATCOUNT):\n TIMELOG.write(str(int(time.time()))+\";\")\n test_all()\n TIMELOG.write(str(int(time.time()))+\"\n\")\n TIMELOG.flush()\n\n TIMELOG.close()\n", + "description": "In an online marketplace, when retrieving data for a large number of products and executing related subqueries to find inventory information, the performance of the query may suffer if the subqueries are not optimized.\n", + "desc": "In an e-commerce platform, there is a database called 'ECommerceDB' that contains a table called 'ProductInventory'. 
This table stores inventory information for various products, including the product ID, current stock level, last inventory update time, supplier ID, and warehouse location. When querying the inventory level of each product, the database might need to perform correlated subqueries. For example, it might first select products of a specific category from the 'ProductDetails' table and then perform subqueries on the 'ProductInventory' table to retrieve the inventory data for these products. However, when dealing with a large number of products, these correlated subqueries can be inefficient and time-consuming. This is because each subquery needs to be executed individually for each product, resulting in a significant amount of data being read from the disk and potentially leading to I/O bottlenecks.\n" + }, + "200": { + "start_time": "1697326060", + "end_time": "1697326133", + "start_timestamp": "2023-10-15 07:27:40", + "end_timestamp": "2023-10-15 07:28:53", + "alerts": [], + "labels": [ + "highly concurrent commits or highly concurrent inserts" + ], + "command": "python anomaly_trigger/main.py --anomaly INSERT_LARGE_DATA", + "script": "import psycopg2\nimport sys\nsys.path.append('/root/DB-GPT/')\nimport time\nimport datetime\nimport random\nimport yaml\nfrom multiprocessing.pool import *\n\nclass DBArgs(object):\n\n def __init__(self, dbtype, config, dbname=None):\n self.dbtype = dbtype\n if self.dbtype == 'mysql':\n self.host = config['host']\n self.port = config['port']\n self.user = config['user']\n self.password = config['password']\n self.dbname = dbname if dbname else config['dbname']\n self.driver = 'com.mysql.jdbc.Driver'\n self.jdbc = 'jdbc:mysql://'\n else:\n self.host = config['host']\n self.port = config['port']\n self.user = config['user']\n self.password = config['password']\n self.dbname = dbname if dbname else config['dbname']\n self.driver = 'org.postgresql.Driver'\n self.jdbc = 'jdbc:postgresql://'\n\nclass Database():\n def __init__(self, args, timeout=-1):\n self.args = args\n self.conn = self.resetConn(timeout)\n\n\n # self.schema = self.compute_table_schema()\n\n def resetConn(self, timeout=-1):\n conn = psycopg2.connect(database=self.args.dbname,\n user=self.args.user,\n password=self.args.password,\n host=self.args.host,\n port=self.args.port)\n return conn\n\n def execute_sqls(self,sql):\n self.conn =self.resetConn(timeout=-1)\n cur = self.conn.cursor()\n cur.execute(sql)\n self.conn.commit()\n cur.close()\n self.conn.close()\n\n def execute_sql_duration(self, duration, sql, max_id=0, commit_interval=500):\n self.conn = self.resetConn(timeout=-1)\n cursor = self.conn.cursor()\n start = time.time()\n cnt = 0\n if duration > 0:\n while (time.time() - start) < duration:\n if max_id > 0:\n id = random.randint(1, max_id - 1)\n cursor.execute(sql + str(id) + ';')\n else:\n cursor.execute(sql)\n cnt += 1\n if cnt % commit_interval == 0:\n self.conn.commit()\n else:\n print(\"error, the duration should be larger than 0\")\n self.conn.commit()\n cursor.close()\n self.conn.close()\n return cnt\n\n def concurrent_execute_sql(self, threads, duration, sql, max_id=0, commit_interval=500):\n pool = ThreadPool(threads)\n results = [pool.apply_async(self.execute_sql_duration, (duration, sql, max_id, commit_interval)) for _ in range(threads)]\n pool.close()\n pool.join()\n return results\n \ndef init():\n #add the config\n config_path = \"/root/DB-GPT/config/tool_config.yaml\"\n with open(config_path, 'r') as config_file:\n config = yaml.safe_load(config_file) \n db_args 
=DBArgs('pgsql', config)\n return db_args\n\n#create a table\ndef create_table(table_name,colsize, ncolumns):\n db=Database(init())\n column_definitions = ', '.join(f'name{i} varchar({colsize})' for i in range(ncolumns))\n creat_sql = f'CREATE TABLE {table_name} (id int, {column_definitions}, time timestamp);'\n db.execute_sqls(creat_sql)\n\n#delete the table\ndef delete_table(table_name):\n db=Database(init())\n delete_sql=f'DROP TABLE if exists {table_name}'\n db.execute_sqls(delete_sql)\n\n#print the current time\ndef print_time():\n current_time = datetime.datetime.now()\n formatted_time = current_time.strftime(\"%Y-%m-%d %H:%M:%S\")\n print(formatted_time)\n\n\ndef insert_large_data(threads,duration,ncolumns,nrows,colsize,table_name='table1'):\n\n print_time()\n #Delete undeleted tables\n delete_table(table_name)\n #create a new table\n create_table(table_name,colsize, ncolumns)\n db=Database(init())\n #insert the data\n #insert_definitions = ', '.join(f'repeat(round(random()*999)::text,{(colsize//3)})' for i in range(ncolumns))\n insert_definitions = ', '.join(f'(SELECT substr(md5(random()::text), 1, {colsize}))' for i in range(ncolumns))\n insert_data=f'insert into {table_name} select generate_series(1,{nrows}),{insert_definitions}, now();'\n db.concurrent_execute_sql(threads,duration,insert_data,commit_interval=1)\n\n #delete the table\n delete_table(table_name)\n \n #print the end time\n print_time()\nif __name__ == \"__main__\":\n # Number of threads to use for concurrent inserts\n num_threads = 179\n \n # Duration for which to run the inserts (in seconds)\n insert_duration = None\n \n # Number of columns in the table\n num_columns = 10\n \n # Number of rows to insert\n num_rows = 89\n \n # Size of each column (in characters)\n column_size = 43\n \n # Table name\n table_name = 'table1'\n \n # Call the insert_large_data function\n insert_large_data(num_threads, insert_duration, num_columns, num_rows, column_size, table_name)\n", + "description": "In a smart home system, if there are 179 devices trying to send a large amount of data to the central database simultaneously, each device having 10 different types of data to be stored in a table with 43 characters per column, and a total of 89 records, this process may cause a database exception due to the high data load.\n", + "desc": "In an Internet of Things (IoT) scenario, imagine a database used for collecting and analyzing sensor data in a smart home environment. This database, called 'SmartHomeDB', is responsible for storing data from various sensors installed in the home, such as temperature sensors, motion sensors, and light sensors. The main table in this database is named 'SensorReadings', which records detailed information about the sensor readings. This table consists of 89 rows, each representing a sensor reading, with 10 columns, each containing information of up to 43 characters. These columns may include sensor ID, reading type, reading value, timestamp, sensor location, and status information.In this particular scenario, a large number of sensor devices are connected to the system, and each device is continuously sending sensor readings at a high frequency. These sensor readings could include temperature values, motion detection events, or light intensity measurements. Due to the high volume of incoming data, the database might encounter issues in handling these large-scale write operations. 
Inefficient data insertion processes, the lack of appropriate data buffering mechanisms, or inadequate indexing strategies can lead to performance bottlenecks. These bottlenecks can result in increased write latency and resource contention within the database, ultimately leading to anomalies in the system.\n" + }, + "201": { + "start_time": "1697326193", + "end_time": "1697326265", + "start_timestamp": "2023-10-15 07:29:53", + "end_timestamp": "2023-10-15 07:31:05", + "alerts": [], + "labels": [ + "highly concurrent commits or highly concurrent inserts" + ], + "command": "python anomaly_trigger/main.py --anomaly INSERT_LARGE_DATA", + "script": "import psycopg2\nimport sys\nsys.path.append('/root/DB-GPT/')\nimport time\nimport datetime\nimport random\nimport yaml\nfrom multiprocessing.pool import *\n\nclass DBArgs(object):\n\n def __init__(self, dbtype, config, dbname=None):\n self.dbtype = dbtype\n if self.dbtype == 'mysql':\n self.host = config['host']\n self.port = config['port']\n self.user = config['user']\n self.password = config['password']\n self.dbname = dbname if dbname else config['dbname']\n self.driver = 'com.mysql.jdbc.Driver'\n self.jdbc = 'jdbc:mysql://'\n else:\n self.host = config['host']\n self.port = config['port']\n self.user = config['user']\n self.password = config['password']\n self.dbname = dbname if dbname else config['dbname']\n self.driver = 'org.postgresql.Driver'\n self.jdbc = 'jdbc:postgresql://'\n\nclass Database():\n def __init__(self, args, timeout=-1):\n self.args = args\n self.conn = self.resetConn(timeout)\n\n\n # self.schema = self.compute_table_schema()\n\n def resetConn(self, timeout=-1):\n conn = psycopg2.connect(database=self.args.dbname,\n user=self.args.user,\n password=self.args.password,\n host=self.args.host,\n port=self.args.port)\n return conn\n\n def execute_sqls(self,sql):\n self.conn =self.resetConn(timeout=-1)\n cur = self.conn.cursor()\n cur.execute(sql)\n self.conn.commit()\n cur.close()\n self.conn.close()\n\n def execute_sql_duration(self, duration, sql, max_id=0, commit_interval=500):\n self.conn = self.resetConn(timeout=-1)\n cursor = self.conn.cursor()\n start = time.time()\n cnt = 0\n if duration > 0:\n while (time.time() - start) < duration:\n if max_id > 0:\n id = random.randint(1, max_id - 1)\n cursor.execute(sql + str(id) + ';')\n else:\n cursor.execute(sql)\n cnt += 1\n if cnt % commit_interval == 0:\n self.conn.commit()\n else:\n print(\"error, the duration should be larger than 0\")\n self.conn.commit()\n cursor.close()\n self.conn.close()\n return cnt\n\n def concurrent_execute_sql(self, threads, duration, sql, max_id=0, commit_interval=500):\n pool = ThreadPool(threads)\n results = [pool.apply_async(self.execute_sql_duration, (duration, sql, max_id, commit_interval)) for _ in range(threads)]\n pool.close()\n pool.join()\n return results\n \ndef init():\n #add the config\n config_path = \"/root/DB-GPT/config/tool_config.yaml\"\n with open(config_path, 'r') as config_file:\n config = yaml.safe_load(config_file) \n db_args =DBArgs('pgsql', config)\n return db_args\n\n#create a table\ndef create_table(table_name,colsize, ncolumns):\n db=Database(init())\n column_definitions = ', '.join(f'name{i} varchar({colsize})' for i in range(ncolumns))\n creat_sql = f'CREATE TABLE {table_name} (id int, {column_definitions}, time timestamp);'\n db.execute_sqls(creat_sql)\n\n#delete the table\ndef delete_table(table_name):\n db=Database(init())\n delete_sql=f'DROP TABLE if exists {table_name}'\n db.execute_sqls(delete_sql)\n\n#print the 
current time\ndef print_time():\n current_time = datetime.datetime.now()\n formatted_time = current_time.strftime(\"%Y-%m-%d %H:%M:%S\")\n print(formatted_time)\n\n\ndef insert_large_data(threads,duration,ncolumns,nrows,colsize,table_name='table1'):\n\n print_time()\n #Delete undeleted tables\n delete_table(table_name)\n #create a new table\n create_table(table_name,colsize, ncolumns)\n db=Database(init())\n #insert the data\n #insert_definitions = ', '.join(f'repeat(round(random()*999)::text,{(colsize//3)})' for i in range(ncolumns))\n insert_definitions = ', '.join(f'(SELECT substr(md5(random()::text), 1, {colsize}))' for i in range(ncolumns))\n insert_data=f'insert into {table_name} select generate_series(1,{nrows}),{insert_definitions}, now();'\n db.concurrent_execute_sql(threads,duration,insert_data,commit_interval=1)\n\n #delete the table\n delete_table(table_name)\n \n #print the end time\n print_time()\nif __name__ == \"__main__\":\n # Number of threads to use for concurrent inserts\n num_threads = 179\n \n # Duration for which to run the inserts (in seconds)\n insert_duration = None\n \n # Number of columns in the table\n num_columns = 39\n \n # Number of rows to insert\n num_rows = 70\n \n # Size of each column (in characters)\n column_size = 52\n \n # Table name\n table_name = 'table1'\n \n # Call the insert_large_data function\n insert_large_data(num_threads, insert_duration, num_columns, num_rows, column_size, table_name)\n", + "description": "In an internet of things (IoT) application, a large amount of data generated by 179 sensors needs to be inserted into the database simultaneously. The database table contains 39 columns, each with a column size of 52 characters, and a total of 70 rows of data. This is simulated to trigger a database exception.\n", + "desc": "In an e-commerce database, there is a table named 'ProductDetails' that stores information about various products. This table contains 70 rows of data, each representing a product, with a total of 39 columns. These columns store information such as product ID, name, price, stock quantity, brand, category, size, color, weight, user rating, number of reviews, production date, expiration date, supplier information, country, shipping method, discount information, image link, description, and product status. In this scenario, the database administrator needs to insert a large amount of data into the 'ProductDetails' table. The insertion operation is performed by 179 threads, and each column can store up to 52 characters of data. 
This insertion of a large volume of data can put a strain on the database's performance and resources, potentially leading to anomalies.\n" + }, + "202": { + "start_time": "1697326325", + "end_time": "1697326385", + "start_timestamp": "2023-10-15 07:32:05", + "end_timestamp": "2023-10-15 07:33:05", + "alerts": [], + "labels": [ + "highly concurrent updates" + ], + "command": "python anomaly_trigger/main.py --anomaly LOCK_CONTENTION", + "script": "import psycopg2\nimport sys\nsys.path.append('/root/DB-GPT/')\nimport time\nimport datetime\nimport random\nimport yaml\nfrom multiprocessing.pool import *\n\n\nclass DBArgs(object):\n\n def __init__(self, dbtype, config, dbname=None):\n self.dbtype = dbtype\n if self.dbtype == 'mysql':\n self.host = config['host']\n self.port = config['port']\n self.user = config['user']\n self.password = config['password']\n self.dbname = dbname if dbname else config['dbname']\n self.driver = 'com.mysql.jdbc.Driver'\n self.jdbc = 'jdbc:mysql://'\n else:\n self.host = config['host']\n self.port = config['port']\n self.user = config['user']\n self.password = config['password']\n self.dbname = dbname if dbname else config['dbname']\n self.driver = 'org.postgresql.Driver'\n self.jdbc = 'jdbc:postgresql://'\n\nclass Database():\n def __init__(self, args, timeout=-1):\n self.args = args\n self.conn = self.resetConn(timeout)\n\n\n # self.schema = self.compute_table_schema()\n\n def resetConn(self, timeout=-1):\n conn = psycopg2.connect(database=self.args.dbname,\n user=self.args.user,\n password=self.args.password,\n host=self.args.host,\n port=self.args.port)\n return conn\n \n def execute_sqls(self,sql):\n self.conn =self.resetConn(timeout=-1)\n cur = self.conn.cursor()\n cur.execute(sql)\n self.conn.commit()\n cur.close()\n self.conn.close()\n\n def execute_sql_duration(self, duration, sql, max_id=0, commit_interval=500):\n self.conn = self.resetConn(timeout=-1)\n cursor = self.conn.cursor()\n start = time.time()\n cnt = 0\n if duration > 0:\n while (time.time() - start) < duration:\n if max_id > 0:\n id = random.randint(1, max_id - 1)\n cursor.execute(sql + str(id) + ';')\n else:\n cursor.execute(sql)\n cnt += 1\n if cnt % commit_interval == 0:\n self.conn.commit()\n else:\n print(\"error, the duration should be larger than 0\")\n self.conn.commit()\n cursor.close()\n self.conn.close()\n return cnt\n\n def concurrent_execute_sql(self, threads, duration, sql, max_id=0, commit_interval=500):\n pool = ThreadPool(threads)\n results = [pool.apply_async(self.execute_sql_duration, (duration, sql, max_id, commit_interval)) for _ in range(threads)]\n pool.close()\n pool.join()\n return results\n\ndef init():\n #add the config\n config_path = \"/root/DB-GPT/config/tool_config.yaml\"\n with open(config_path, 'r') as config_file:\n config = yaml.safe_load(config_file) \n db_args =DBArgs('pgsql', config)\n return db_args\n\n\n#create a table\ndef create_table(table_name,colsize, ncolumns):\n db=Database(init())\n column_definitions = ', '.join(f'name{i} varchar({colsize})' for i in range(ncolumns))\n creat_sql = f'CREATE TABLE {table_name} (id int, {column_definitions}, time timestamp);'\n db.execute_sqls(creat_sql)\n\n#delete the table\ndef delete_table(table_name):\n db=Database(init())\n delete_sql=f'DROP TABLE if exists {table_name}'\n db.execute_sqls(delete_sql)\n\n#print the current time\ndef print_time():\n current_time = datetime.datetime.now()\n formatted_time = current_time.strftime(\"%Y-%m-%d %H:%M:%S\")\n print(formatted_time)\n\n\ndef 
lock_contention(threads,duration,ncolumns,nrows,colsize,table_name='table1'):\n #create a new table\n print_time()\n delete_table(table_name)\n create_table(table_name,colsize, ncolumns)\n db=Database(init())\n # insert some data to be updated \n insert_definitions = ', '.join(f'(SELECT substr(md5(random()::text), 1, {colsize}))' for i in range(ncolumns))\n insert_data=f'insert into {table_name} select generate_series(1,{nrows}),{insert_definitions}, now();' \n db.execute_sqls(insert_data) \n pool = Pool(threads)\n for _ in range(threads):\n pool.apply_async(\n lock, (table_name, ncolumns, colsize, duration, nrows))\n pool.close()\n pool.join()\n #delete the table\n delete_table(table_name)\n print_time()\n\ndef lock(table_name, ncolumns, colsize, duration, nrows):\n args=init()\n start = time.time()\n #lock_contention\n while time.time()-start < duration:\n conn = psycopg2.connect(database=args.dbname, user=args.user, password=args.password,\n host=args.host, port=args.port)\n cur = conn.cursor()\n while time.time()-start < duration:\n col_name = random.randint(0, ncolumns-1)\n row_name = random.randint(1, nrows-1)\n lock_contention = f'update {table_name} set name{col_name}=(SELECT substr(md5(random()::text), 1, {colsize})) where id ={row_name}'\n #db.concurrent_execute_sql(threads,duration,lock_contention,nrows)\n cur.execute(lock_contention)\n conn.commit()\n conn.commit()\n conn.close()\n\nif __name__ == \"__main__\":\n # Number of threads to use for concurrent inserts\n num_threads = 76\n \n # Duration for which to run the inserts (in seconds)\n insert_duration = None\n \n # Number of columns in the table\n num_columns = 99\n \n # Number of rows to insert\n num_rows = 213\n \n # Size of each column (in characters)\n column_size = 84\n \n # Table name\n table_name = 'table1'\n \n # Call the insert_large_data function\n lock_contention(num_threads, insert_duration, num_columns, num_rows, column_size, table_name)\n", + "description": "In the scenario of a database for an e-commerce platform, 76 users try to perform simultaneous update operations on a database table containing 99 columns and 213 rows of records, with each column having a size of 84 characters. These users compete with each other to lock the database table, causing a database exception.\n", + "desc": "In a banking scenario, there is a database named 'BankingDB' that stores customer and transaction data. One of the key tables in this database is 'AccountTransactions', which records information about various banking transactions. This table contains 213 rows of data, each representing a transaction record, with a total of 99 columns, each containing information of up to 84 characters. These columns include transaction ID, account number, transaction type, transaction amount, transaction date and time, counterparty account information, transaction status, employee ID, transaction location, currency type, and more. In this specific situation, 76 users simultaneously attempt to perform transaction-related operations, such as updating transaction statuses, modifying transaction amounts, or adding transaction notes. Due to the database's locking mechanism and the high concurrency of these operations, there is a competition for locking the 'AccountTransactions' table. This contention for locks can lead to performance issues in the database. During busy hours, such locking-related issues can result in delays or failures in processing other users' transaction requests, affecting the daily operations of the bank. 
Additionally, this frequent locking could cause the database's transaction log to rapidly grow, leading to storage space consumption and potential interruptions in database services.\n" + }, + "203": { + "start_time": "1697326445", + "end_time": "1697326506", + "start_timestamp": "2023-10-15 07:34:05", + "end_timestamp": "2023-10-15 07:35:06", + "alerts": [], + "labels": [ + "highly deletes" + ], + "command": "python anomaly_trigger/main.py --anomaly VACUUM", + "script": "import psycopg2\nimport sys\nsys.path.append('/root/DB-GPT/')\nimport time\nimport datetime\nimport random\nimport yaml\nfrom multiprocessing.pool import *\n\n\nclass DBArgs(object):\n\n def __init__(self, dbtype, config, dbname=None):\n self.dbtype = dbtype\n if self.dbtype == 'mysql':\n self.host = config['host']\n self.port = config['port']\n self.user = config['user']\n self.password = config['password']\n self.dbname = dbname if dbname else config['dbname']\n self.driver = 'com.mysql.jdbc.Driver'\n self.jdbc = 'jdbc:mysql://'\n else:\n self.host = config['host']\n self.port = config['port']\n self.user = config['user']\n self.password = config['password']\n self.dbname = dbname if dbname else config['dbname']\n self.driver = 'org.postgresql.Driver'\n self.jdbc = 'jdbc:postgresql://'\n\nclass Database():\n def __init__(self, args, timeout=-1):\n self.args = args\n self.conn = self.resetConn(timeout)\n\n\n # self.schema = self.compute_table_schema()\n\n def resetConn(self, timeout=-1):\n conn = psycopg2.connect(database=self.args.dbname,\n user=self.args.user,\n password=self.args.password,\n host=self.args.host,\n port=self.args.port)\n return conn\n \n def execute_sqls(self,sql):\n self.conn =self.resetConn(timeout=-1)\n cur = self.conn.cursor()\n cur.execute(sql)\n self.conn.commit()\n cur.close()\n self.conn.close()\n\n def execute_sql_duration(self, duration, sql, max_id=0, commit_interval=500):\n self.conn = self.resetConn(timeout=-1)\n cursor = self.conn.cursor()\n start = time.time()\n cnt = 0\n if duration > 0:\n while (time.time() - start) < duration:\n if max_id > 0:\n id = random.randint(1, max_id - 1)\n cursor.execute(sql + str(id) + ';')\n else:\n cursor.execute(sql)\n cnt += 1\n if cnt % commit_interval == 0:\n self.conn.commit()\n else:\n print(\"error, the duration should be larger than 0\")\n self.conn.commit()\n cursor.close()\n self.conn.close()\n return cnt\n\n def concurrent_execute_sql(self, threads, duration, sql, max_id=0, commit_interval=500):\n pool = ThreadPool(threads)\n results = [pool.apply_async(self.execute_sql_duration, (duration, sql, max_id, commit_interval)) for _ in range(threads)]\n pool.close()\n pool.join()\n return results\n \ndef init():\n #add the config\n config_path = \"/root/DB-GPT/config/tool_config.yaml\"\n with open(config_path, 'r') as config_file:\n config = yaml.safe_load(config_file) \n db_args =DBArgs('pgsql', config)\n return db_args\n\n\n#create a table\ndef create_table(table_name,colsize, ncolumns):\n db=Database(init())\n column_definitions = ', '.join(f'name{i} varchar({colsize})' for i in range(ncolumns))\n creat_sql = f'CREATE TABLE {table_name} (id int, {column_definitions}, time timestamp);'\n db.execute_sqls(creat_sql)\n\n#delete the table\ndef delete_table(table_name):\n db=Database(init())\n delete_sql=f'DROP TABLE if exists {table_name}'\n db.execute_sqls(delete_sql)\n\n#print the current time\ndef print_time():\n current_time = datetime.datetime.now()\n formatted_time = current_time.strftime(\"%Y-%m-%d %H:%M:%S\")\n print(formatted_time)\n\n\ndef 
vacuum(threads,duration,ncolumns,nrows,colsize,table_name='table1'):\n db=Database(init())\n #create a new table\n print_time()\n delete_table(table_name)\n create_table(table_name,colsize, ncolumns)\n\n # insert some data to be deleted\n insert_definitions = ', '.join(f'(SELECT substr(md5(random()::text), 1, {colsize}))' for i in range(ncolumns))\n insert_data=f'insert into {table_name} select generate_series(1,{nrows}),{insert_definitions}, now();' \n db.execute_sqls(insert_data) \n\n # delete 80% of the rows\n delete_nrows=int(nrows*0.8)\n vacuum=f'delete from {table_name} where id < {delete_nrows};'\n db.execute_sqls(vacuum)\n\n # do the select , then the vacuum occurs\n select='select * from '+table_name+' where id='\n db.concurrent_execute_sql(threads,duration,select,nrows)\n\n #delete the table\n delete_table(table_name)\n print_time()\n\n\nif __name__ == \"__main__\":\n # Number of threads to use for concurrent inserts\n num_threads = 184\n \n # Duration for which to run the inserts (in seconds)\n insert_duration = None\n \n # Number of columns in the table\n num_columns = 10\n \n # Number of rows to insert\n num_rows = 2735363\n \n # Size of each column (in characters)\n column_size = 52\n \n # Table name\n table_name = 'table1'\n \n # Call the insert_large_data function\n vacuum(num_threads, insert_duration, num_columns, num_rows, column_size, table_name)\n", + "description": "In a database system for an online store, when 184 users simultaneously perform search operations after a large-scale data cleaning operation on a table with 10 columns, 2,735,363 rows, and each column containing 52 characters of product records, an exception may occur due to the increased workload and lack of optimization.\n", + "desc": "In an e-commerce scenario, there is a database used by an online store named 'OnlineStoreDB'. This database contains a key table called 'ProductRecords', which stores detailed information about products. The table consists of 2,735,363 rows, each representing a different product, and has 10 columns, each containing information of up to 52 characters. These columns may include product ID, name, price, stock quantity, description, brand, category, size, color, and weight.The anomaly being triggered is 'VACUUM'. This refers to a large-scale data cleanup operation that needs to be performed in the database. This could be due to reasons such as removing out-of-date products, updating the product line, or consolidating data. The operation involves deleting a significant number of product records from the 'ProductRecords' table. Since the deletion operation is done on a massive scale, it can potentially impact the performance of the database.Without appropriate pre-processing and optimization measures, such as incremental deletion, batch processing, or performing operations during low traffic periods, such large-scale deletions could trigger anomalies. 
These anomalies can affect other database operations, such as inserting new products, updating prices, or handling user queries, which can in turn impact the operations of the entire online store.\n" + }, + "204": { + "start_time": "1697326566", + "end_time": "1697326680", + "start_timestamp": "2023-10-15 07:36:06", + "end_timestamp": "2023-10-15 07:38:00", + "alerts": [], + "labels": [ + "too many indexes" + ], + "command": "python anomaly_trigger/main.py --anomaly REDUNDANT_INDEX", + "script": "import psycopg2\nimport sys\nsys.path.append('/root/DB-GPT/')\nimport time\nimport datetime\nimport random\nimport yaml\nfrom multiprocessing.pool import *\n\n\nclass DBArgs(object):\n\n def __init__(self, dbtype, config, dbname=None):\n self.dbtype = dbtype\n if self.dbtype == 'mysql':\n self.host = config['host']\n self.port = config['port']\n self.user = config['user']\n self.password = config['password']\n self.dbname = dbname if dbname else config['dbname']\n self.driver = 'com.mysql.jdbc.Driver'\n self.jdbc = 'jdbc:mysql://'\n else:\n self.host = config['host']\n self.port = config['port']\n self.user = config['user']\n self.password = config['password']\n self.dbname = dbname if dbname else config['dbname']\n self.driver = 'org.postgresql.Driver'\n self.jdbc = 'jdbc:postgresql://'\n\nclass Database():\n def __init__(self, args, timeout=-1):\n self.args = args\n self.conn = self.resetConn(timeout)\n\n\n # self.schema = self.compute_table_schema()\n\n def resetConn(self, timeout=-1):\n conn = psycopg2.connect(database=self.args.dbname,\n user=self.args.user,\n password=self.args.password,\n host=self.args.host,\n port=self.args.port)\n return conn\n \n def execute_sqls(self,sql):\n self.conn =self.resetConn(timeout=-1)\n cur = self.conn.cursor()\n cur.execute(sql)\n self.conn.commit()\n cur.close()\n self.conn.close()\n\n def execute_sql_duration(self, duration, sql, max_id=0, commit_interval=500):\n self.conn = self.resetConn(timeout=-1)\n cursor = self.conn.cursor()\n start = time.time()\n cnt = 0\n if duration > 0:\n while (time.time() - start) < duration:\n if max_id > 0:\n id = random.randint(1, max_id - 1)\n cursor.execute(sql + str(id) + ';')\n else:\n cursor.execute(sql)\n cnt += 1\n if cnt % commit_interval == 0:\n self.conn.commit()\n else:\n print(\"error, the duration should be larger than 0\")\n self.conn.commit()\n cursor.close()\n self.conn.close()\n return cnt\n\n def concurrent_execute_sql(self, threads, duration, sql, max_id=0, commit_interval=500):\n pool = ThreadPool(threads)\n results = [pool.apply_async(self.execute_sql_duration, (duration, sql, max_id, commit_interval)) for _ in range(threads)]\n pool.close()\n pool.join()\n return results\n \n def build_index(self, table_name, idx_num):\n self.conn = self.resetConn(timeout=-1)\n cursor = self.conn.cursor()\n \n for i in range(0, idx_num):\n the_sql = 'CREATE INDEX index_' + table_name + '_' + str(i) + ' ON ' + table_name + '(name' + str(i) + ');'\n print(the_sql)\n cursor.execute(the_sql)\n\n \n self.conn.commit()\n self.conn.close()\n return\n\n\n \n def drop_index(self,table_name):\n self.conn = self.resetConn(timeout=-1)\n cursor = self.conn.cursor()\n cursor.execute(\"select indexname from pg_indexes where tablename='\"+table_name+\"';\")\n idxs = cursor.fetchall()\n for idx in idxs:\n the_sql = 'DROP INDEX ' + idx[0] + ';'\n cursor.execute(the_sql)\n print(the_sql)\n self.conn.commit()\n self.conn.close()\n return\n\n\ndef init():\n #add the config\n config_path = \"/root/DB-GPT/config/tool_config.yaml\"\n with 
open(config_path, 'r') as config_file:\n config = yaml.safe_load(config_file) \n db_args =DBArgs('pgsql', config)\n return db_args\n\n\n#create a table\ndef create_table(table_name,colsize, ncolumns):\n db=Database(init())\n column_definitions = ', '.join(f'name{i} varchar({colsize})' for i in range(ncolumns))\n creat_sql = f'CREATE TABLE {table_name} (id int, {column_definitions}, time timestamp);'\n db.execute_sqls(creat_sql)\n\n#delete the table\ndef delete_table(table_name):\n db=Database(init())\n delete_sql=f'DROP TABLE if exists {table_name}'\n db.execute_sqls(delete_sql)\n\n#print the current time\ndef print_time():\n current_time = datetime.datetime.now()\n formatted_time = current_time.strftime(\"%Y-%m-%d %H:%M:%S\")\n print(formatted_time)\n\ndef redundent_index(threads,duration,ncolumns,nrows,colsize,nindex,table_name='table1'):\n #create a new table\n print_time()\n delete_table(table_name)\n create_table(table_name,colsize, ncolumns)\n db=Database(init())\n # insert some data to be updated \n insert_definitions = ', '.join(f'(SELECT substr(md5(random()::text), 1, {colsize}))' for i in range(ncolumns))\n insert_data=f'insert into {table_name} select generate_series(1,{nrows}),{insert_definitions}, now();' \n db.execute_sqls(insert_data) \n\n #initialization of the indexes\n nindex=int((nindex*ncolumns)/10)\n db.build_index(table_name,nindex)\n id_index='CREATE INDEX index_'+table_name+'_id ON '+table_name+'(id);'\n db.execute_sqls(id_index)\n\n #lock_contention\n pool = Pool(threads)\n for _ in range(threads):\n pool.apply_async(\n lock, (table_name, ncolumns, colsize, duration, nrows))\n pool.close()\n pool.join()\n\n #drop the index\n db.drop_index(table_name)\n\n #delete the table\n delete_table(table_name)\n print_time()\n\n\ndef lock(table_name, ncolumns, colsize, duration, nrows):\n args=init()\n start = time.time()\n #lock_contention\n while time.time()-start < duration:\n conn = psycopg2.connect(database=args.dbname, user=args.user, password=args.password,\n host=args.host, port=args.port)\n cur = conn.cursor()\n while time.time()-start < duration:\n col_name = random.randint(0, ncolumns-1)\n row_name = random.randint(1, nrows-1)\n lock_contention = f'update {table_name} set name{col_name}=(SELECT substr(md5(random()::text), 1, {colsize})) where id ={row_name}'\n #db.concurrent_execute_sql(threads,duration,lock_contention,nrows)\n cur.execute(lock_contention)\n conn.commit()\n conn.commit()\n conn.close()\n\nif __name__ == \"__main__\":\n # Number of threads to use for concurrent inserts\n num_threads = 7\n \n # Duration for which to run the inserts (in seconds)\n insert_duration = None\n \n # Number of columns in the table\n num_columns = 74\n \n # Number of rows to insert\n num_rows = 890431\n \n # Size of each column (in characters)\n column_size = 55\n \n # Table name\n table_name = 'table1'\n \n nindex=6\n \n # Call the insert_large_data function\n redundent_index(num_threads, insert_duration, num_columns, num_rows, column_size, nindex,table_name)\n", + "description": "In an inventory management system for a retail store, 7 users simultaneously perform a search operation on a database table containing 74 columns and 890,431 rows of product records. Each column has a size of 55 characters. 
However, a large number of unnecessary indexes are created beforehand for attributes like product name, category, and price range, which introduces additional storage overhead and slows down the search operation.\n", + "desc": "In a business intelligence scenario, there is a database called 'BusinessIntelligenceDB' that stores and analyzes data for a company. This database contains multiple tables, one of which is the 'DataAnalysis' table. The 'DataAnalysis' table contains a large amount of data, with 890,431 rows and 74 columns, each column containing information of up to 55 characters. The columns in this table represent various data attributes, such as sales data, market trends, customer profiles, and product catalog information. When performing complex data analysis tasks, such as generating reports or conducting trend analysis, the database administrator might create redundant indexes on the 'DataAnalysis' table to improve query performance. These indexes could be based on different data attributes, such as date, sales region, product category, or customer segment. However, if too many redundant indexes are created or if the indexes are not properly managed, it can lead to negative consequences. For example, excessive indexes can consume additional storage space and impact write performance when new data is inserted, updated, or deleted. Additionally, frequent updates to the table (such as inserting or deleting rows) can result in index fragmentation, leading to decreased query performance. This can especially become an issue during peak periods when multiple users are accessing and querying the database simultaneously. Therefore, it is crucial for the database administrator to carefully evaluate the need for indexes and ensure that they are effectively managed to avoid the potential negative impact on the database's performance and user experience.\n" + }, + "205": { + "start_time": "1697326740", + "end_time": "1697326831", + "start_timestamp": "2023-10-15 07:39:00", + "end_timestamp": "2023-10-15 07:40:31", + "alerts": [], + "labels": [ + "INSERT_LARGE_DATA", + "IO_CONTENTION" + ], + "command": "python anomaly_trigger/main.py --anomaly INSERT_LARGE_DATA,IO_CONTENTION", + "script": "import os\nimport datetime\n\n#print the current time\ndef print_time():\n current_time = datetime.datetime.now()\n formatted_time = current_time.strftime(\"%Y-%m-%d %H:%M:%S\")\n print(formatted_time)\n\nif __name__ == \"__main__\":\n print_time()\n command = (\n \"su - root -c 'cd /sysbench-tpcc-master; \"\n \"./tpcc.lua --db-driver=pgsql --tables=2 --scale=3 --threads=50 --events=0 \"\n \"--pgsql-host=xxxx --pgsql-user=xxxx --pgsql-password=xxxx \"\n \"--pgsql-port=5432 --pgsql-db=tpcc --time=90 --rand-type=uniform --report-interval=10 run'\"\n )\n\n os.system(command)\n print_time()\n", + "description": "In a collaborative document editing system, multiple users are uploading, downloading, or editing documents simultaneously. The system experiences contention with input/output (I/O) operations, leading to slower file transfers.\n", + "desc": "In an IoT scenario, imagine a database named 'IoTDataDB' that stores and analyzes sensor data collected from various devices. This database contains a key table named 'SensorReadings' that records information about sensor readings. Each row in this table represents a reading from a specific sensor and includes fields such as sensor ID, reading type, reading value, timestamp, location, and device ID. 
The database is designed to handle a large volume of data from multiple sensors. In this scenario, multiple sensors might simultaneously transmit data at a high frequency, causing the database to face performance issues related to I/O contention. Due to the high volume of incoming data, the system's storage and network bandwidth might become strained. This can result in slower data processing and storage operations, especially if the server's processing capabilities or bandwidth are limited. Additionally, frequent write operations in the database, such as storing sensor readings, can impact the overall performance of the system. During peak periods of data transmission, the database might encounter locking and transaction management issues, further slowing down the processing of sensor data.\n" + }, + "206": { + "start_time": "1697326891", + "end_time": "1697326951", + "start_timestamp": "2023-10-15 07:41:31", + "end_timestamp": "2023-10-15 07:42:31", + "alerts": [], + "labels": [ + "POOR JOIN PERFORMANCE", + "CPU CONTENTION" + ], + "command": "python anomaly_trigger/main.py --anomaly POOR_JOIN_PERFORMANCE,CPU_CONTENTION", + "script": "import os\nimport re\nimport time\n\nimport psycopg2\n\n\nREPEATCOUNT = 1\nTIMELOGPATH = str(int(time.time())) + \"_job_trigger_time_log.txt\"\nTIMELOG = open(TIMELOGPATH, 'w+')\n\n\nclass Database():\n\n def __init__(self):\n self.conn = None\n self.conn = psycopg2.connect(database='imdbload',\n user='xxxx',\n password='xxxx',\n host='xxxx',\n port=5432)\n\n def execute_sql(self, sql):\n fail = 1\n cur = self.conn.cursor()\n i = 0\n cnt = 3\n while fail == 1 and i < cnt:\n try:\n fail = 0\n cur.execute(sql)\n except BaseException as error:\n fail = 1\n print(error)\n res = []\n if fail == 0:\n res = cur.fetchall()\n i = i + 1\n if fail == 1:\n # print(\"SQL Execution Fatal!!\", sql)\n return 0, ''\n elif fail == 0:\n return 1, res\n\n\ndef all_sql_files():\n res_path = \"{}/join-order-benchmark-master/\".format(\n os.path.dirname(os.path.abspath(__file__)))\n # all_file_list = list(filter(file_filter, os.listdir(res_path)))\n # all_file_list = sorted(all_file_list, key=custom_sort)\n all_file_list = [\n '1a.sql', '1b.sql', '1c.sql', '1d.sql',\n '2a.sql', '2b.sql', '2c.sql', '2d.sql',\n '3a.sql', '3b.sql', '3c.sql',\n '4a.sql', '4b.sql', '4c.sql',\n '5a.sql', '5b.sql', '5c.sql',\n '6a.sql', '6b.sql', '6c.sql', '6d.sql', '6e.sql', '6f.sql',\n '7a.sql', '7b.sql', '7c.sql',\n '8a.sql', '8b.sql', '8c.sql', '8d.sql',\n '9a.sql', '9b.sql', '9c.sql', '9d.sql',\n '10a.sql', '10b.sql', '10c.sql',\n '11a.sql', '11b.sql', '11c.sql', '11d.sql',\n '12a.sql', '12b.sql', '12c.sql',\n '13a.sql', '13b.sql', '13c.sql', '13d.sql',\n '14a.sql', '14b.sql', '14c.sql',\n '15a.sql', '15b.sql', '15c.sql', '15d.sql',\n '16a.sql', '16b.sql', '16c.sql', '16d.sql',\n '17a.sql', '17b.sql', '17c.sql', '17d.sql', '17e.sql', '17f.sql',\n '18a.sql', '18b.sql', '18c.sql',\n '19a.sql', '19b.sql', '19c.sql', '19d.sql',\n '20a.sql', '20b.sql', '20c.sql',\n '21a.sql', '21b.sql', '21c.sql',\n '22a.sql', '22b.sql', '22c.sql', '22d.sql',\n '23a.sql', '23b.sql', '23c.sql',\n '24a.sql', '24b.sql',\n '25a.sql', '25b.sql', '25c.sql',\n '26a.sql', '26b.sql', '26c.sql',\n '27a.sql', '27b.sql', '27c.sql',\n '28a.sql', '28b.sql', '28c.sql',\n '29a.sql', '29b.sql', '29c.sql',\n '30a.sql', '30b.sql', '30c.sql',\n '31a.sql', '31b.sql', '31c.sql',\n '32a.sql', '32b.sql',\n '33a.sql', '33b.sql', '33c.sql']\n\n print(all_file_list)\n files_list = []\n for file in all_file_list:\n files_list.append(res_path + 
file)\n return files_list\n\n\ndef custom_sort(item):\n # \u63d0\u53d6\u6570\u5b57\u548c\u5b57\u6bcd\u90e8\u5206\n match = re.match(r'(\\d+)(\\D+)', item)\n # \u5c06\u6570\u5b57\u90e8\u5206\u8f6c\u6362\u4e3a\u6574\u6570\u4ee5\u8fdb\u884c\u6bd4\u8f83\n num_part = int(match.group(1))\n # \u8fd4\u56de\u5143\u7ec4\u4ee5\u6309\u6570\u5b57\u548c\u5b57\u6bcd\u6392\u5e8f\n return (num_part, match.group(2))\n\n\ndef file_filter(f):\n if f[-4:] == '.sql' and 'schema' not in f and 'fkindexes' not in f:\n return True\n else:\n return False\n\n\ndef get_sql_from_file(file_name):\n file = open(file_name)\n lines = file.readlines().copy()\n sql = ''\n for line in lines:\n sql += line\n sql = sql.replace('\n', ' ').replace(' ', ' ').replace(' ', ' ')\n file.close()\n return sql\n\n\ndef test_hint_from_file(sql_file):\n db = Database()\n sql = get_sql_from_file(sql_file)\n success, result_cont = db.execute_sql(sql)\n print(success, result_cont)\n\n\ndef test_all():\n sql_files = all_sql_files()\n\n for sql_file in list(sql_files)[:-10]:\n if sql_file:\n test_hint_from_file(sql_file)\n\n\ndef test_one():\n res_path = \"{}/join-order-benchmark-master/\".format(\n os.path.dirname(os.path.abspath(__file__)))\n test_hint_from_file(res_path + '1a.sql')\n\n\nif __name__ == '__main__':\n for i in range(0, REPEATCOUNT):\n TIMELOG.write(str(int(time.time()))+\";\")\n test_all()\n TIMELOG.write(str(int(time.time()))+\"\n\")\n TIMELOG.flush()\n\n TIMELOG.close()\n\n)\n", + "description": "In a data analysis scenario, multiple users are performing join operations on large datasets using a Python script. The join operations are not optimized, leading to poor performance. Additionally, these operations are causing CPU contention, resulting in slower processing times and resource conflicts.\n", + "desc": "In a Business Intelligence (BI) scenario, imagine a database named 'CorporateAnalyticsDB', used for storing and analyzing various business data of a large corporation. This database contains multiple complex tables such as 'SalesData', 'CustomerProfiles', 'ProductCatalog', and 'MarketTrends', each filled with a large number of rows and columns, recording detailed business information. In this scenario, the company's analysts frequently need to perform complex join queries across multiple tables to generate comprehensive business reports. For example, they might need to join the 'SalesData' table (containing sales records) with the 'CustomerProfiles' table (containing customer information) to analyze the purchasing behaviors of different customer groups. At the same time, they might also need to link these data with 'ProductCatalog' (the product catalog) and 'MarketTrends' (market trends) to gain deeper market insights.Given the large size of these tables and the involvement of multi-table joins, the performance of these queries can become very slow. If the join keys in these tables are not effectively indexed, or if the queries are not properly optimized, then executing these join operations can consume a significant amount of time and resources.During peak periods, when multiple complex join queries are executed simultaneously, this can lead to competition for CPU resources, further reducing query efficiency. 
The CPU contention might occur due to too many compute-intensive queries running on the database server, or the server's CPU resources being insufficient to handle these queries.\n" + }, + "207": { + "start_time": "1697327011", + "end_time": "1697327158", + "start_timestamp": "2023-10-15 07:43:31", + "end_timestamp": "2023-10-15 07:45:58", + "alerts": [], + "labels": [ + "FETCH_LARGE_DATA", + "CORRELATED SUBQUERY" + ], + "command": "python anomaly_trigger/main.py --anomaly FETCH_LARGE_DATA,CORRELATED_SUBQUERY", + "script": "import os\nimport re\nimport time\n\nimport psycopg2\n\n\nREPEATCOUNT = 1\nTIMELOGPATH = str(int(time.time())) + \"_tpch_trigger_time_log.txt\"\nTIMELOG = open(TIMELOGPATH, 'w+')\n\n\nclass Database():\n\n def __init__(self):\n self.conn = None\n self.conn = psycopg2.connect(database='tpch',\n user='xxxx',\n password='xxxx',\n host='xxxx',\n port=5432)\n\n def execute_sql(self, sql):\n fail = 1\n cur = self.conn.cursor()\n i = 0\n cnt = 3\n while fail == 1 and i < cnt:\n try:\n fail = 0\n cur.execute(sql)\n except BaseException as error:\n fail = 1\n print(error)\n res = []\n if fail == 0:\n res = cur.fetchall()\n i = i + 1\n if fail == 1:\n # print(\"SQL Execution Fatal!!\", sql)\n return 0, ''\n elif fail == 0:\n return 1, res\n\n\ndef all_sql_files():\n res_path = \"{}/tpch-queries/\".format(\n os.path.dirname(os.path.abspath(__file__)))\n # all_file_list = list(filter(file_filter, os.listdir(res_path)))\n # all_file_list = sorted(all_file_list, key=custom_sort)\n all_file_list = [\n '4.explain.sql']\n\n print(all_file_list)\n files_list = []\n for file in all_file_list:\n files_list.append(res_path + file)\n return files_list\n\n\ndef custom_sort(item):\n # \u63d0\u53d6\u6570\u5b57\u548c\u5b57\u6bcd\u90e8\u5206\n match = re.match(r'(\\d+)(\\D+)', item)\n # \u5c06\u6570\u5b57\u90e8\u5206\u8f6c\u6362\u4e3a\u6574\u6570\u4ee5\u8fdb\u884c\u6bd4\u8f83\n num_part = int(match.group(1))\n # \u8fd4\u56de\u5143\u7ec4\u4ee5\u6309\u6570\u5b57\u548c\u5b57\u6bcd\u6392\u5e8f\n return (num_part, match.group(2))\n\n\ndef file_filter(f):\n if f[-4:] == '.sql' and 'schema' not in f and 'fkindexes' not in f:\n return True\n else:\n return False\n\n\ndef get_sql_from_file(file_name):\n file = open(file_name)\n lines = file.readlines().copy()\n sql = ''\n for line in lines:\n sql += line\n sql = sql.replace('\n', ' ').replace(' ', ' ').replace(' ', ' ')\n file.close()\n return sql\n\n\ndef test_hint_from_file(sql_file):\n db = Database()\n sql = get_sql_from_file(sql_file)\n success, result_cont = db.execute_sql(sql)\n print(success, result_cont)\n\n\ndef test_all():\n sql_files = all_sql_files()\n\n for sql_file in list(sql_files):\n if sql_file:\n test_hint_from_file(sql_file)\n\n\ndef test_one():\n res_path = \"{}/tpch-queries/\".format(\n os.path.dirname(os.path.abspath(__file__)))\n test_hint_from_file(res_path + '1.explain.sql')\n\n\nif __name__ == '__main__':\n for i in range(0, REPEATCOUNT):\n TIMELOG.write(str(int(time.time()))+\";\")\n test_all()\n TIMELOG.write(str(int(time.time()))+\"\n\")\n TIMELOG.flush()\n\n TIMELOG.close()\n", + "description": "In an e-commerce platform's database, there is a query to fetch a large amount of data involving correlated subqueries. This query retrieves the inventory of each product, but if the subqueries are not optimized, the performance of the query may deteriorate.\n", + "desc": "In an e-commerce database named 'ECommerceDB', there is a table called 'ProductInventory' that stores inventory information for various products. 
This table contains data for tens of thousands or even hundreds of thousands of products, including the product ID, stock level, last update time, supplier ID, and warehouse location. In this database, there is a need to perform queries that involve related subqueries, such as determining the total inventory of all products in a specific category. These queries involve selecting products from the 'ProductDetails' table based on the category and then performing subqueries on the 'ProductInventory' table to retrieve the inventory data. However, when the number of products is large, executing individual subqueries for each product can be slow and inefficient. This is because retrieving inventory information for a large number of products requires reading a significant amount of data from the disk, which can cause I/O bottlenecks and impact query performance.\n" + }, + "208": { + "start_time": "1697327219", + "end_time": "1697327291", + "start_timestamp": "2023-10-15 07:46:59", + "end_timestamp": "2023-10-15 07:48:11", + "alerts": [], + "labels": [ + "highly concurrent commits or highly concurrent inserts" + ], + "command": "python anomaly_trigger/main.py --anomaly INSERT_LARGE_DATA", + "script": "import psycopg2\nimport sys\nsys.path.append('/root/DB-GPT/')\nimport time\nimport datetime\nimport random\nimport yaml\nfrom multiprocessing.pool import *\n\nclass DBArgs(object):\n\n def __init__(self, dbtype, config, dbname=None):\n self.dbtype = dbtype\n if self.dbtype == 'mysql':\n self.host = config['host']\n self.port = config['port']\n self.user = config['user']\n self.password = config['password']\n self.dbname = dbname if dbname else config['dbname']\n self.driver = 'com.mysql.jdbc.Driver'\n self.jdbc = 'jdbc:mysql://'\n else:\n self.host = config['host']\n self.port = config['port']\n self.user = config['user']\n self.password = config['password']\n self.dbname = dbname if dbname else config['dbname']\n self.driver = 'org.postgresql.Driver'\n self.jdbc = 'jdbc:postgresql://'\n\nclass Database():\n def __init__(self, args, timeout=-1):\n self.args = args\n self.conn = self.resetConn(timeout)\n\n\n # self.schema = self.compute_table_schema()\n\n def resetConn(self, timeout=-1):\n conn = psycopg2.connect(database=self.args.dbname,\n user=self.args.user,\n password=self.args.password,\n host=self.args.host,\n port=self.args.port)\n return conn\n\n def execute_sqls(self,sql):\n self.conn =self.resetConn(timeout=-1)\n cur = self.conn.cursor()\n cur.execute(sql)\n self.conn.commit()\n cur.close()\n self.conn.close()\n\n def execute_sql_duration(self, duration, sql, max_id=0, commit_interval=500):\n self.conn = self.resetConn(timeout=-1)\n cursor = self.conn.cursor()\n start = time.time()\n cnt = 0\n if duration > 0:\n while (time.time() - start) < duration:\n if max_id > 0:\n id = random.randint(1, max_id - 1)\n cursor.execute(sql + str(id) + ';')\n else:\n cursor.execute(sql)\n cnt += 1\n if cnt % commit_interval == 0:\n self.conn.commit()\n else:\n print(\"error, the duration should be larger than 0\")\n self.conn.commit()\n cursor.close()\n self.conn.close()\n return cnt\n\n def concurrent_execute_sql(self, threads, duration, sql, max_id=0, commit_interval=500):\n pool = ThreadPool(threads)\n results = [pool.apply_async(self.execute_sql_duration, (duration, sql, max_id, commit_interval)) for _ in range(threads)]\n pool.close()\n pool.join()\n return results\n \ndef init():\n #add the config\n config_path = \"/root/DB-GPT/config/tool_config.yaml\"\n with open(config_path, 'r') as config_file:\n 
config = yaml.safe_load(config_file) \n db_args =DBArgs('pgsql', config)\n return db_args\n\n#create a table\ndef create_table(table_name,colsize, ncolumns):\n db=Database(init())\n column_definitions = ', '.join(f'name{i} varchar({colsize})' for i in range(ncolumns))\n creat_sql = f'CREATE TABLE {table_name} (id int, {column_definitions}, time timestamp);'\n db.execute_sqls(creat_sql)\n\n#delete the table\ndef delete_table(table_name):\n db=Database(init())\n delete_sql=f'DROP TABLE if exists {table_name}'\n db.execute_sqls(delete_sql)\n\n#print the current time\ndef print_time():\n current_time = datetime.datetime.now()\n formatted_time = current_time.strftime(\"%Y-%m-%d %H:%M:%S\")\n print(formatted_time)\n\n\ndef insert_large_data(threads,duration,ncolumns,nrows,colsize,table_name='table1'):\n\n print_time()\n #Delete undeleted tables\n delete_table(table_name)\n #create a new table\n create_table(table_name,colsize, ncolumns)\n db=Database(init())\n #insert the data\n #insert_definitions = ', '.join(f'repeat(round(random()*999)::text,{(colsize//3)})' for i in range(ncolumns))\n insert_definitions = ', '.join(f'(SELECT substr(md5(random()::text), 1, {colsize}))' for i in range(ncolumns))\n insert_data=f'insert into {table_name} select generate_series(1,{nrows}),{insert_definitions}, now();'\n db.concurrent_execute_sql(threads,duration,insert_data,commit_interval=1)\n\n #delete the table\n delete_table(table_name)\n \n #print the end time\n print_time()\nif __name__ == \"__main__\":\n # Number of threads to use for concurrent inserts\n num_threads = 184\n \n # Duration for which to run the inserts (in seconds)\n insert_duration = None\n \n # Number of columns in the table\n num_columns = 5\n \n # Number of rows to insert\n num_rows = 77\n \n # Size of each column (in characters)\n column_size = 27\n \n # Table name\n table_name = 'table1'\n \n # Call the insert_large_data function\n insert_large_data(num_threads, insert_duration, num_columns, num_rows, column_size, table_name)\n", + "description": "In an IoT application where 184 sensors generate a large amount of data, there is a need to insert this data into the database. The database table has 5 columns, each with a size of 27 characters, and contains 77 rows. This process simulates a database exception caused by simultaneously inserting a large amount of data.\n", + "desc": "In an Internet of Things (IoT) scenario, there is a database called 'IoTDataDB' used to store and manage data collected from various IoT devices. This database contains a key table called 'DeviceData' which records detailed information about the data collected from IoT devices. This table consists of 77 rows of data, each representing a data record from a device, with a total of 5 columns, each containing information of up to 27 characters. These columns may include device ID, data type (such as temperature, humidity, pressure, etc.), data value, timestamp, and device location.In this particular scenario, there is a need to insert a large amount of data into the 'DeviceData' table. To simulate this situation, the script is executed with the command 'python anomaly_trigger/main.py --anomaly INSERT_LARGE_DATA --threads 184 --ncolumn 5 --colsize 27 --nrow 77'. The command specifies that 184 threads are used to insert data into the table, with each thread inserting data into 77 rows. 
The 'ncolumn' parameter indicates that each row has 5 columns, and the 'colsize' parameter specifies that each column can have up to 27 characters.Inserting such a large amount of data concurrently into the database can put significant strain on the database server's resources, such as CPU and memory. It can also cause contention for database locks and result in increased latency for other database operations. Additionally, the lack of proper data partitioning or indexing strategies could further impact the performance of this insertion operation.\n" + }, + "209": { + "start_time": "1697327351", + "end_time": "1697327423", + "start_timestamp": "2023-10-15 07:49:11", + "end_timestamp": "2023-10-15 07:50:23", + "alerts": [], + "labels": [ + "highly concurrent commits or highly concurrent inserts" + ], + "command": "python anomaly_trigger/main.py --anomaly INSERT_LARGE_DATA", + "script": "import psycopg2\nimport sys\nsys.path.append('/root/DB-GPT/')\nimport time\nimport datetime\nimport random\nimport yaml\nfrom multiprocessing.pool import *\n\nclass DBArgs(object):\n\n def __init__(self, dbtype, config, dbname=None):\n self.dbtype = dbtype\n if self.dbtype == 'mysql':\n self.host = config['host']\n self.port = config['port']\n self.user = config['user']\n self.password = config['password']\n self.dbname = dbname if dbname else config['dbname']\n self.driver = 'com.mysql.jdbc.Driver'\n self.jdbc = 'jdbc:mysql://'\n else:\n self.host = config['host']\n self.port = config['port']\n self.user = config['user']\n self.password = config['password']\n self.dbname = dbname if dbname else config['dbname']\n self.driver = 'org.postgresql.Driver'\n self.jdbc = 'jdbc:postgresql://'\n\nclass Database():\n def __init__(self, args, timeout=-1):\n self.args = args\n self.conn = self.resetConn(timeout)\n\n\n # self.schema = self.compute_table_schema()\n\n def resetConn(self, timeout=-1):\n conn = psycopg2.connect(database=self.args.dbname,\n user=self.args.user,\n password=self.args.password,\n host=self.args.host,\n port=self.args.port)\n return conn\n\n def execute_sqls(self,sql):\n self.conn =self.resetConn(timeout=-1)\n cur = self.conn.cursor()\n cur.execute(sql)\n self.conn.commit()\n cur.close()\n self.conn.close()\n\n def execute_sql_duration(self, duration, sql, max_id=0, commit_interval=500):\n self.conn = self.resetConn(timeout=-1)\n cursor = self.conn.cursor()\n start = time.time()\n cnt = 0\n if duration > 0:\n while (time.time() - start) < duration:\n if max_id > 0:\n id = random.randint(1, max_id - 1)\n cursor.execute(sql + str(id) + ';')\n else:\n cursor.execute(sql)\n cnt += 1\n if cnt % commit_interval == 0:\n self.conn.commit()\n else:\n print(\"error, the duration should be larger than 0\")\n self.conn.commit()\n cursor.close()\n self.conn.close()\n return cnt\n\n def concurrent_execute_sql(self, threads, duration, sql, max_id=0, commit_interval=500):\n pool = ThreadPool(threads)\n results = [pool.apply_async(self.execute_sql_duration, (duration, sql, max_id, commit_interval)) for _ in range(threads)]\n pool.close()\n pool.join()\n return results\n \ndef init():\n #add the config\n config_path = \"/root/DB-GPT/config/tool_config.yaml\"\n with open(config_path, 'r') as config_file:\n config = yaml.safe_load(config_file) \n db_args =DBArgs('pgsql', config)\n return db_args\n\n#create a table\ndef create_table(table_name,colsize, ncolumns):\n db=Database(init())\n column_definitions = ', '.join(f'name{i} varchar({colsize})' for i in range(ncolumns))\n creat_sql = f'CREATE TABLE {table_name} 
(id int, {column_definitions}, time timestamp);'\n db.execute_sqls(creat_sql)\n\n#delete the table\ndef delete_table(table_name):\n db=Database(init())\n delete_sql=f'DROP TABLE if exists {table_name}'\n db.execute_sqls(delete_sql)\n\n#print the current time\ndef print_time():\n current_time = datetime.datetime.now()\n formatted_time = current_time.strftime(\"%Y-%m-%d %H:%M:%S\")\n print(formatted_time)\n\n\ndef insert_large_data(threads,duration,ncolumns,nrows,colsize,table_name='table1'):\n\n print_time()\n #Delete undeleted tables\n delete_table(table_name)\n #create a new table\n create_table(table_name,colsize, ncolumns)\n db=Database(init())\n #insert the data\n #insert_definitions = ', '.join(f'repeat(round(random()*999)::text,{(colsize//3)})' for i in range(ncolumns))\n insert_definitions = ', '.join(f'(SELECT substr(md5(random()::text), 1, {colsize}))' for i in range(ncolumns))\n insert_data=f'insert into {table_name} select generate_series(1,{nrows}),{insert_definitions}, now();'\n db.concurrent_execute_sql(threads,duration,insert_data,commit_interval=1)\n\n #delete the table\n delete_table(table_name)\n \n #print the end time\n print_time()\nif __name__ == \"__main__\":\n # Number of threads to use for concurrent inserts\n num_threads = 184\n \n # Duration for which to run the inserts (in seconds)\n insert_duration = None\n \n # Number of columns in the table\n num_columns = 29\n \n # Number of rows to insert\n num_rows = 77\n \n # Size of each column (in characters)\n column_size = 89\n \n # Table name\n table_name = 'table1'\n \n # Call the insert_large_data function\n insert_large_data(num_threads, insert_duration, num_columns, num_rows, column_size, table_name)\n", + "description": "In a database used for a scientific research project, there is a need to insert a large amount of data generated by 184 experiments simultaneously. This data includes 29 different parameters, each with a maximum size of 89 characters. The data is being inserted into a table that has 77 rows. The purpose of this script is to simulate the database exception that may occur during the insertion process.\n", + "desc": "In an e-commerce database named 'ECommerceDB', which stores information about various products, there is a table named 'ProductDetails'. This table contains 77 rows, each representing a product, and has a total of 29 columns, each with a size of up to 89 characters. These columns may include product ID, name, price, stock quantity, brand, category, size, color, weight, user rating, number of reviews, production date, expiration date, supplier information, country, shipping method, discount information, image link, description, and product status. In this scenario, 184 users are simultaneously inserting new product data into the 'ProductDetails' table using the 'INSERT_LARGE_DATA' command. This could be part of a product catalog update or a bulk import operation. However, due to the high number of concurrent insertions and potentially inefficient data processing mechanisms, the database may encounter performance issues. These issues could include slower insertion rates, increased resource usage, or even database locking, which could lead to anomalies in the database. 
Such anomalies can affect the overall efficiency and stability of the e-commerce platform.\n" + }, + "210": { + "start_time": "1697327483", + "end_time": "1697327544", + "start_timestamp": "2023-10-15 07:51:23", + "end_timestamp": "2023-10-15 07:52:24", + "alerts": [], + "labels": [ + "highly concurrent updates" + ], + "command": "python anomaly_trigger/main.py --anomaly LOCK_CONTENTION", + "script": "import psycopg2\nimport sys\nsys.path.append('/root/DB-GPT/')\nimport time\nimport datetime\nimport random\nimport yaml\nfrom multiprocessing.pool import *\n\n\nclass DBArgs(object):\n\n def __init__(self, dbtype, config, dbname=None):\n self.dbtype = dbtype\n if self.dbtype == 'mysql':\n self.host = config['host']\n self.port = config['port']\n self.user = config['user']\n self.password = config['password']\n self.dbname = dbname if dbname else config['dbname']\n self.driver = 'com.mysql.jdbc.Driver'\n self.jdbc = 'jdbc:mysql://'\n else:\n self.host = config['host']\n self.port = config['port']\n self.user = config['user']\n self.password = config['password']\n self.dbname = dbname if dbname else config['dbname']\n self.driver = 'org.postgresql.Driver'\n self.jdbc = 'jdbc:postgresql://'\n\nclass Database():\n def __init__(self, args, timeout=-1):\n self.args = args\n self.conn = self.resetConn(timeout)\n\n\n # self.schema = self.compute_table_schema()\n\n def resetConn(self, timeout=-1):\n conn = psycopg2.connect(database=self.args.dbname,\n user=self.args.user,\n password=self.args.password,\n host=self.args.host,\n port=self.args.port)\n return conn\n \n def execute_sqls(self,sql):\n self.conn =self.resetConn(timeout=-1)\n cur = self.conn.cursor()\n cur.execute(sql)\n self.conn.commit()\n cur.close()\n self.conn.close()\n\n def execute_sql_duration(self, duration, sql, max_id=0, commit_interval=500):\n self.conn = self.resetConn(timeout=-1)\n cursor = self.conn.cursor()\n start = time.time()\n cnt = 0\n if duration > 0:\n while (time.time() - start) < duration:\n if max_id > 0:\n id = random.randint(1, max_id - 1)\n cursor.execute(sql + str(id) + ';')\n else:\n cursor.execute(sql)\n cnt += 1\n if cnt % commit_interval == 0:\n self.conn.commit()\n else:\n print(\"error, the duration should be larger than 0\")\n self.conn.commit()\n cursor.close()\n self.conn.close()\n return cnt\n\n def concurrent_execute_sql(self, threads, duration, sql, max_id=0, commit_interval=500):\n pool = ThreadPool(threads)\n results = [pool.apply_async(self.execute_sql_duration, (duration, sql, max_id, commit_interval)) for _ in range(threads)]\n pool.close()\n pool.join()\n return results\n\ndef init():\n #add the config\n config_path = \"/root/DB-GPT/config/tool_config.yaml\"\n with open(config_path, 'r') as config_file:\n config = yaml.safe_load(config_file) \n db_args =DBArgs('pgsql', config)\n return db_args\n\n\n#create a table\ndef create_table(table_name,colsize, ncolumns):\n db=Database(init())\n column_definitions = ', '.join(f'name{i} varchar({colsize})' for i in range(ncolumns))\n creat_sql = f'CREATE TABLE {table_name} (id int, {column_definitions}, time timestamp);'\n db.execute_sqls(creat_sql)\n\n#delete the table\ndef delete_table(table_name):\n db=Database(init())\n delete_sql=f'DROP TABLE if exists {table_name}'\n db.execute_sqls(delete_sql)\n\n#print the current time\ndef print_time():\n current_time = datetime.datetime.now()\n formatted_time = current_time.strftime(\"%Y-%m-%d %H:%M:%S\")\n print(formatted_time)\n\n\ndef 
lock_contention(threads,duration,ncolumns,nrows,colsize,table_name='table1'):\n #create a new table\n print_time()\n delete_table(table_name)\n create_table(table_name,colsize, ncolumns)\n db=Database(init())\n # insert some data to be updated \n insert_definitions = ', '.join(f'(SELECT substr(md5(random()::text), 1, {colsize}))' for i in range(ncolumns))\n insert_data=f'insert into {table_name} select generate_series(1,{nrows}),{insert_definitions}, now();' \n db.execute_sqls(insert_data) \n pool = Pool(threads)\n for _ in range(threads):\n pool.apply_async(\n lock, (table_name, ncolumns, colsize, duration, nrows))\n pool.close()\n pool.join()\n #delete the table\n delete_table(table_name)\n print_time()\n\ndef lock(table_name, ncolumns, colsize, duration, nrows):\n args=init()\n start = time.time()\n #lock_contention\n while time.time()-start < duration:\n conn = psycopg2.connect(database=args.dbname, user=args.user, password=args.password,\n host=args.host, port=args.port)\n cur = conn.cursor()\n while time.time()-start < duration:\n col_name = random.randint(0, ncolumns-1)\n row_name = random.randint(1, nrows-1)\n lock_contention = f'update {table_name} set name{col_name}=(SELECT substr(md5(random()::text), 1, {colsize})) where id ={row_name}'\n #db.concurrent_execute_sql(threads,duration,lock_contention,nrows)\n cur.execute(lock_contention)\n conn.commit()\n conn.commit()\n conn.close()\n\nif __name__ == \"__main__\":\n # Number of threads to use for concurrent inserts\n num_threads = 181\n \n # Duration for which to run the inserts (in seconds)\n insert_duration = None\n \n # Number of columns in the table\n num_columns = 92\n \n # Number of rows to insert\n num_rows = 260\n \n # Size of each column (in characters)\n column_size = 52\n \n # Table name\n table_name = 'table1'\n \n # Call the insert_large_data function\n lock_contention(num_threads, insert_duration, num_columns, num_rows, column_size, table_name)\n", + "description": "In the database of an online store, 181 users simultaneously attempt to perform frequent update operations in a database table containing 92 columns and 260 rows of product records, where each column has a size of 52 characters. The users compete with each other to lock the database table to perform the update operation, which leads to contention and potential database exceptions.\n", + "desc": "In a banking scenario, there is a database named 'BankingDB' that is used to handle customer and transaction data. Within this database, there is a key table called 'AccountTransactions' that stores detailed information about various banking transactions. This table contains 260 rows of data, with each row representing a transaction record for an account. The table has a total of 92 columns, with each column containing information of up to 52 characters. These columns include transaction ID, account number, transaction type (such as deposit or withdrawal), transaction amount, transaction date and time, counterparty account information, transaction status, employee ID, transaction location, currency type, and more. In this particular scenario, there are 181 users who are simultaneously attempting to update the same or adjacent rows in the 'AccountTransactions' table. Due to the table's design and the database's locking mechanism, these concurrent update operations are causing contention for locking the database table. This prolonged locking can lead to performance issues in the database, such as delayed processing or failure of other users' transaction requests. 
If these lock contention incidents occur frequently, they can also result in rapid growth in the database transaction log and temporary interruptions in database services.\n" + }, + "211": { + "start_time": "1697327604", + "end_time": "1697327722", + "start_timestamp": "2023-10-15 07:53:24", + "end_timestamp": "2023-10-15 07:55:22", + "alerts": [], + "labels": [ + "highly deletes" + ], + "command": "python anomaly_trigger/main.py --anomaly VACUUM", + "script": "import psycopg2\nimport sys\nsys.path.append('/root/DB-GPT/')\nimport time\nimport datetime\nimport random\nimport yaml\nfrom multiprocessing.pool import *\n\n\nclass DBArgs(object):\n\n def __init__(self, dbtype, config, dbname=None):\n self.dbtype = dbtype\n if self.dbtype == 'mysql':\n self.host = config['host']\n self.port = config['port']\n self.user = config['user']\n self.password = config['password']\n self.dbname = dbname if dbname else config['dbname']\n self.driver = 'com.mysql.jdbc.Driver'\n self.jdbc = 'jdbc:mysql://'\n else:\n self.host = config['host']\n self.port = config['port']\n self.user = config['user']\n self.password = config['password']\n self.dbname = dbname if dbname else config['dbname']\n self.driver = 'org.postgresql.Driver'\n self.jdbc = 'jdbc:postgresql://'\n\nclass Database():\n def __init__(self, args, timeout=-1):\n self.args = args\n self.conn = self.resetConn(timeout)\n\n\n # self.schema = self.compute_table_schema()\n\n def resetConn(self, timeout=-1):\n conn = psycopg2.connect(database=self.args.dbname,\n user=self.args.user,\n password=self.args.password,\n host=self.args.host,\n port=self.args.port)\n return conn\n \n def execute_sqls(self,sql):\n self.conn =self.resetConn(timeout=-1)\n cur = self.conn.cursor()\n cur.execute(sql)\n self.conn.commit()\n cur.close()\n self.conn.close()\n\n def execute_sql_duration(self, duration, sql, max_id=0, commit_interval=500):\n self.conn = self.resetConn(timeout=-1)\n cursor = self.conn.cursor()\n start = time.time()\n cnt = 0\n if duration > 0:\n while (time.time() - start) < duration:\n if max_id > 0:\n id = random.randint(1, max_id - 1)\n cursor.execute(sql + str(id) + ';')\n else:\n cursor.execute(sql)\n cnt += 1\n if cnt % commit_interval == 0:\n self.conn.commit()\n else:\n print(\"error, the duration should be larger than 0\")\n self.conn.commit()\n cursor.close()\n self.conn.close()\n return cnt\n\n def concurrent_execute_sql(self, threads, duration, sql, max_id=0, commit_interval=500):\n pool = ThreadPool(threads)\n results = [pool.apply_async(self.execute_sql_duration, (duration, sql, max_id, commit_interval)) for _ in range(threads)]\n pool.close()\n pool.join()\n return results\n \ndef init():\n #add the config\n config_path = \"/root/DB-GPT/config/tool_config.yaml\"\n with open(config_path, 'r') as config_file:\n config = yaml.safe_load(config_file) \n db_args =DBArgs('pgsql', config)\n return db_args\n\n\n#create a table\ndef create_table(table_name,colsize, ncolumns):\n db=Database(init())\n column_definitions = ', '.join(f'name{i} varchar({colsize})' for i in range(ncolumns))\n creat_sql = f'CREATE TABLE {table_name} (id int, {column_definitions}, time timestamp);'\n db.execute_sqls(creat_sql)\n\n#delete the table\ndef delete_table(table_name):\n db=Database(init())\n delete_sql=f'DROP TABLE if exists {table_name}'\n db.execute_sqls(delete_sql)\n\n#print the current time\ndef print_time():\n current_time = datetime.datetime.now()\n formatted_time = current_time.strftime(\"%Y-%m-%d %H:%M:%S\")\n print(formatted_time)\n\n\ndef 
vacuum(threads,duration,ncolumns,nrows,colsize,table_name='table1'):\n db=Database(init())\n #create a new table\n print_time()\n delete_table(table_name)\n create_table(table_name,colsize, ncolumns)\n\n # insert some data to be deleted\n insert_definitions = ', '.join(f'(SELECT substr(md5(random()::text), 1, {colsize}))' for i in range(ncolumns))\n insert_data=f'insert into {table_name} select generate_series(1,{nrows}),{insert_definitions}, now();' \n db.execute_sqls(insert_data) \n\n # delete 80% of the rows\n delete_nrows=int(nrows*0.8)\n vacuum=f'delete from {table_name} where id < {delete_nrows};'\n db.execute_sqls(vacuum)\n\n # do the select , then the vacuum occurs\n select='select * from '+table_name+' where id='\n db.concurrent_execute_sql(threads,duration,select,nrows)\n\n #delete the table\n delete_table(table_name)\n print_time()\n\n\nif __name__ == \"__main__\":\n # Number of threads to use for concurrent inserts\n num_threads = 173\n \n # Duration for which to run the inserts (in seconds)\n insert_duration = None\n \n # Number of columns in the table\n num_columns = 5\n \n # Number of rows to insert\n num_rows = 3788837\n \n # Size of each column (in characters)\n column_size = 89\n \n # Table name\n table_name = 'table1'\n \n # Call the insert_large_data function\n vacuum(num_threads, insert_duration, num_columns, num_rows, column_size, table_name)\n", + "description": "In a database used for a large-scale online platform, when 173 users simultaneously perform searches after a vacuum operation on a database table with 5 columns and 3,788,837 rows, each column's size being 89 characters, an exception in the database may occur due to the increased search load and the absence of necessary indexing.\n", + "desc": "In an e-commerce platform's database, there is a database used for storing and managing product information named 'ProductDB'. This database contains a primary table named 'ProductDetails', which records detailed information about different types of products. This table consists of 3,788,837 rows of data, each representing a specific product, and has a total of 5 columns, with each column capable of storing up to 89 characters. These columns include product ID, name, price, stock quantity, and category. Suppose the database administrator needs to perform a VACUUM operation on the 'ProductDetails' table. This operation involves recovering disk space by removing dead tuples, optimizing table layout, and reorganizing data to improve performance and storage efficiency. In this case, the administrator decides to perform the VACUUM operation with 173 threads in order to expedite the process.However, performing a VACUUM operation on a large table with a high number of threads can put a significant strain on the database server's resources, particularly CPU and I/O. This can lead to high CPU contention and I/O contention, as the server needs to handle multiple concurrent VACUUM operations. If the server's resources are limited or insufficiently allocated, it could result in performance issues and slowdowns in other database operations, such as query processing or data manipulation. 
Therefore, it is important for the administrator to carefully consider the impact on server resources and plan the VACUUM operation accordingly to prevent anomalies and maintain smooth database operations.\n" + }, + "212": { + "start_time": "1697327782", + "end_time": "1697327897", + "start_timestamp": "2023-10-15 07:56:22", + "end_timestamp": "2023-10-15 07:58:17", + "alerts": [], + "labels": [ + "too many indexes" + ], + "command": "python anomaly_trigger/main.py --anomaly REDUNDANT_INDEX", + "script": "import psycopg2\nimport sys\nsys.path.append('/root/DB-GPT/')\nimport time\nimport datetime\nimport random\nimport yaml\nfrom multiprocessing.pool import *\n\n\nclass DBArgs(object):\n\n def __init__(self, dbtype, config, dbname=None):\n self.dbtype = dbtype\n if self.dbtype == 'mysql':\n self.host = config['host']\n self.port = config['port']\n self.user = config['user']\n self.password = config['password']\n self.dbname = dbname if dbname else config['dbname']\n self.driver = 'com.mysql.jdbc.Driver'\n self.jdbc = 'jdbc:mysql://'\n else:\n self.host = config['host']\n self.port = config['port']\n self.user = config['user']\n self.password = config['password']\n self.dbname = dbname if dbname else config['dbname']\n self.driver = 'org.postgresql.Driver'\n self.jdbc = 'jdbc:postgresql://'\n\nclass Database():\n def __init__(self, args, timeout=-1):\n self.args = args\n self.conn = self.resetConn(timeout)\n\n\n # self.schema = self.compute_table_schema()\n\n def resetConn(self, timeout=-1):\n conn = psycopg2.connect(database=self.args.dbname,\n user=self.args.user,\n password=self.args.password,\n host=self.args.host,\n port=self.args.port)\n return conn\n \n def execute_sqls(self,sql):\n self.conn =self.resetConn(timeout=-1)\n cur = self.conn.cursor()\n cur.execute(sql)\n self.conn.commit()\n cur.close()\n self.conn.close()\n\n def execute_sql_duration(self, duration, sql, max_id=0, commit_interval=500):\n self.conn = self.resetConn(timeout=-1)\n cursor = self.conn.cursor()\n start = time.time()\n cnt = 0\n if duration > 0:\n while (time.time() - start) < duration:\n if max_id > 0:\n id = random.randint(1, max_id - 1)\n cursor.execute(sql + str(id) + ';')\n else:\n cursor.execute(sql)\n cnt += 1\n if cnt % commit_interval == 0:\n self.conn.commit()\n else:\n print(\"error, the duration should be larger than 0\")\n self.conn.commit()\n cursor.close()\n self.conn.close()\n return cnt\n\n def concurrent_execute_sql(self, threads, duration, sql, max_id=0, commit_interval=500):\n pool = ThreadPool(threads)\n results = [pool.apply_async(self.execute_sql_duration, (duration, sql, max_id, commit_interval)) for _ in range(threads)]\n pool.close()\n pool.join()\n return results\n \n def build_index(self, table_name, idx_num):\n self.conn = self.resetConn(timeout=-1)\n cursor = self.conn.cursor()\n \n for i in range(0, idx_num):\n the_sql = 'CREATE INDEX index_' + table_name + '_' + str(i) + ' ON ' + table_name + '(name' + str(i) + ');'\n print(the_sql)\n cursor.execute(the_sql)\n\n \n self.conn.commit()\n self.conn.close()\n return\n\n\n \n def drop_index(self,table_name):\n self.conn = self.resetConn(timeout=-1)\n cursor = self.conn.cursor()\n cursor.execute(\"select indexname from pg_indexes where tablename='\"+table_name+\"';\")\n idxs = cursor.fetchall()\n for idx in idxs:\n the_sql = 'DROP INDEX ' + idx[0] + ';'\n cursor.execute(the_sql)\n print(the_sql)\n self.conn.commit()\n self.conn.close()\n return\n\n\ndef init():\n #add the config\n config_path = 
\"/root/DB-GPT/config/tool_config.yaml\"\n with open(config_path, 'r') as config_file:\n config = yaml.safe_load(config_file) \n db_args =DBArgs('pgsql', config)\n return db_args\n\n\n#create a table\ndef create_table(table_name,colsize, ncolumns):\n db=Database(init())\n column_definitions = ', '.join(f'name{i} varchar({colsize})' for i in range(ncolumns))\n creat_sql = f'CREATE TABLE {table_name} (id int, {column_definitions}, time timestamp);'\n db.execute_sqls(creat_sql)\n\n#delete the table\ndef delete_table(table_name):\n db=Database(init())\n delete_sql=f'DROP TABLE if exists {table_name}'\n db.execute_sqls(delete_sql)\n\n#print the current time\ndef print_time():\n current_time = datetime.datetime.now()\n formatted_time = current_time.strftime(\"%Y-%m-%d %H:%M:%S\")\n print(formatted_time)\n\ndef redundent_index(threads,duration,ncolumns,nrows,colsize,nindex,table_name='table1'):\n #create a new table\n print_time()\n delete_table(table_name)\n create_table(table_name,colsize, ncolumns)\n db=Database(init())\n # insert some data to be updated \n insert_definitions = ', '.join(f'(SELECT substr(md5(random()::text), 1, {colsize}))' for i in range(ncolumns))\n insert_data=f'insert into {table_name} select generate_series(1,{nrows}),{insert_definitions}, now();' \n db.execute_sqls(insert_data) \n\n #initialization of the indexes\n nindex=int((nindex*ncolumns)/10)\n db.build_index(table_name,nindex)\n id_index='CREATE INDEX index_'+table_name+'_id ON '+table_name+'(id);'\n db.execute_sqls(id_index)\n\n #lock_contention\n pool = Pool(threads)\n for _ in range(threads):\n pool.apply_async(\n lock, (table_name, ncolumns, colsize, duration, nrows))\n pool.close()\n pool.join()\n\n #drop the index\n db.drop_index(table_name)\n\n #delete the table\n delete_table(table_name)\n print_time()\n\n\ndef lock(table_name, ncolumns, colsize, duration, nrows):\n args=init()\n start = time.time()\n #lock_contention\n while time.time()-start < duration:\n conn = psycopg2.connect(database=args.dbname, user=args.user, password=args.password,\n host=args.host, port=args.port)\n cur = conn.cursor()\n while time.time()-start < duration:\n col_name = random.randint(0, ncolumns-1)\n row_name = random.randint(1, nrows-1)\n lock_contention = f'update {table_name} set name{col_name}=(SELECT substr(md5(random()::text), 1, {colsize})) where id ={row_name}'\n #db.concurrent_execute_sql(threads,duration,lock_contention,nrows)\n cur.execute(lock_contention)\n conn.commit()\n conn.commit()\n conn.close()\n\nif __name__ == \"__main__\":\n # Number of threads to use for concurrent inserts\n num_threads = 6\n \n # Duration for which to run the inserts (in seconds)\n insert_duration = None\n \n # Number of columns in the table\n num_columns = 79\n \n # Number of rows to insert\n num_rows = 732240\n \n # Size of each column (in characters)\n column_size = 81\n \n # Table name\n table_name = 'table1'\n \n nindex=6\n \n # Call the insert_large_data function\n redundent_index(num_threads, insert_duration, num_columns, num_rows, column_size, nindex,table_name)\n", + "description": "In an online marketplace database, there are six users performing a query operation on a database table containing 79 columns and 732,240 rows of product records, each with a column size of 81 characters. At the beginning of the query, a large number of indexes are created for items such as product name, category, and price range. However, after the query operation, these indexes are deleted. 
This simulation showcases the additional storage footprint and performance overhead caused by creating and deleting redundant indexes.\n", + "desc": "In an e-commerce database scenario, let's consider a database called 'OnlineStoreDB', which is responsible for storing and managing the products available on an online store. This database contains a table called 'ProductCatalog' that holds information about various products. In this case, the 'ProductCatalog' table consists of 732,240 rows of data, with each row representing a different product. The table has a total of 79 columns, each capable of storing information with a size of up to 81 characters. These columns may include details such as product ID, name, price, stock quantity, description, brand, category, size, color, weight, production date, expiration date, supplier information, country, user rating, number of reviews, sales status, promotional information, image link, and other relevant attributes.The anomaly that is being triggered in this case is 'REDUNDANT_INDEX'. In order to improve the performance of queries, the database administrator has created multiple indexes on different columns of the 'ProductCatalog' table. These indexes are intended to accelerate searches and improve query execution time. However, due to the redundancy of these indexes, unnecessary computational resources are being used, leading to increased storage usage and additional performance overhead. This issue can become more pronounced when there are multiple users simultaneously querying the database.By triggering the 'REDUNDANT_INDEX' anomaly, it simulates a situation where 6 users are conducting concurrent queries on the 'ProductCatalog' table. This scenario helps in identifying the impact of redundant indexes on the performance of the database. Specifically, it helps in understanding how the creation and deletion of these indexes during query execution can affect the storage utilization and overall performance of the online store database.\n" + }, + "213": { + "start_time": "1697327957", + "end_time": "1697328048", + "start_timestamp": "2023-10-15 07:59:17", + "end_timestamp": "2023-10-15 08:00:48", + "alerts": [], + "labels": [ + "INSERT_LARGE_DATA", + "IO_CONTENTION" + ], + "command": "python anomaly_trigger/main.py --anomaly INSERT_LARGE_DATA,IO_CONTENTION", + "script": "import os\nimport datetime\n\n#print the current time\ndef print_time():\n current_time = datetime.datetime.now()\n formatted_time = current_time.strftime(\"%Y-%m-%d %H:%M:%S\")\n print(formatted_time)\n\nif __name__ == \"__main__\":\n print_time()\n command = (\n \"su - root -c 'cd /sysbench-tpcc-master; \"\n \"./tpcc.lua --db-driver=pgsql --tables=2 --scale=3 --threads=50 --events=0 \"\n \"--pgsql-host=xxxx --pgsql-user=xxxx --pgsql-password=xxxx \"\n \"--pgsql-port=5432 --pgsql-db=tpcc --time=90 --rand-type=uniform --report-interval=10 run'\"\n )\n\n os.system(command)\n print_time()\n", + "description": "In a file sharing system, multiple users are simultaneously uploading, downloading, or editing files, causing contention for input/output operations. As a result, the file transfer process becomes slower due to the competition for input/output resources.\n", + "desc": "In a file sharing system scenario, we can imagine a database called 'TeamFileShareDB', which is a system used by teams or organizations to share files. 
This database not only stores the actual files, but also keeps track of the metadata associated with each file, such as the uploader's information, file size, creation and modification dates, version history, access permissions, and download counts. On a typical workday, multiple users may be simultaneously uploading, downloading, or editing files. For example, a project team is collaborating to complete an important report, with team members frequently uploading the latest versions of files, while others download them for viewing or editing. Additionally, the system might be used for storing and sharing large files, such as presentations, video conference recordings, or design drawings. Due to such high concurrency in file operations, the 'TeamFileShareDB' database faces challenges related to input/output (I/O) contention. When multiple users simultaneously upload or download large files, the system's storage and network bandwidth may become significantly strained. This I/O contention can result in slower file transfer speeds, especially in situations where there is limited bandwidth or insufficient server processing capabilities. Furthermore, frequent write operations in the database, such as file uploads and metadata updates, can impact database performance. During peak periods, the database may encounter locking and transaction management issues, further slowing down file processing and metadata recording.\n" + }, + "214": { + "start_time": "1697328108", + "end_time": "1697328168", + "start_timestamp": "2023-10-15 08:01:48", + "end_timestamp": "2023-10-15 08:02:48", + "alerts": [], + "labels": [ + "POOR JOIN PERFORMANCE", + "CPU CONTENTION" + ], + "command": "python anomaly_trigger/main.py --anomaly POOR_JOIN_PERFORMANCE,CPU_CONTENTION", + "script": "import os\nimport re\nimport time\n\nimport psycopg2\n\n\nREPEATCOUNT = 1\nTIMELOGPATH = str(int(time.time())) + \"_job_trigger_time_log.txt\"\nTIMELOG = open(TIMELOGPATH, 'w+')\n\n\nclass Database():\n\n def __init__(self):\n self.conn = None\n self.conn = psycopg2.connect(database='imdbload',\n user='xxxx',\n password='xxxx',\n host='xxxx',\n port=5432)\n\n def execute_sql(self, sql):\n fail = 1\n cur = self.conn.cursor()\n i = 0\n cnt = 3\n while fail == 1 and i < cnt:\n try:\n fail = 0\n cur.execute(sql)\n except BaseException as error:\n fail = 1\n print(error)\n res = []\n if fail == 0:\n res = cur.fetchall()\n i = i + 1\n if fail == 1:\n # print(\"SQL Execution Fatal!!\", sql)\n return 0, ''\n elif fail == 0:\n return 1, res\n\n\ndef all_sql_files():\n res_path = \"{}/join-order-benchmark-master/\".format(\n os.path.dirname(os.path.abspath(__file__)))\n # all_file_list = list(filter(file_filter, os.listdir(res_path)))\n # all_file_list = sorted(all_file_list, key=custom_sort)\n all_file_list = [\n '1a.sql', '1b.sql', '1c.sql', '1d.sql',\n '2a.sql', '2b.sql', '2c.sql', '2d.sql',\n '3a.sql', '3b.sql', '3c.sql',\n '4a.sql', '4b.sql', '4c.sql',\n '5a.sql', '5b.sql', '5c.sql',\n '6a.sql', '6b.sql', '6c.sql', '6d.sql', '6e.sql', '6f.sql',\n '7a.sql', '7b.sql', '7c.sql',\n '8a.sql', '8b.sql', '8c.sql', '8d.sql',\n '9a.sql', '9b.sql', '9c.sql', '9d.sql',\n '10a.sql', '10b.sql', '10c.sql',\n '11a.sql', '11b.sql', '11c.sql', '11d.sql',\n '12a.sql', '12b.sql', '12c.sql',\n '13a.sql', '13b.sql', '13c.sql', '13d.sql',\n '14a.sql', '14b.sql', '14c.sql',\n '15a.sql', '15b.sql', '15c.sql', '15d.sql',\n '16a.sql', '16b.sql', '16c.sql', '16d.sql',\n '17a.sql', '17b.sql', '17c.sql', '17d.sql', '17e.sql', '17f.sql',\n '18a.sql', '18b.sql', '18c.sql',\n 
'19a.sql', '19b.sql', '19c.sql', '19d.sql',\n '20a.sql', '20b.sql', '20c.sql',\n '21a.sql', '21b.sql', '21c.sql',\n '22a.sql', '22b.sql', '22c.sql', '22d.sql',\n '23a.sql', '23b.sql', '23c.sql',\n '24a.sql', '24b.sql',\n '25a.sql', '25b.sql', '25c.sql',\n '26a.sql', '26b.sql', '26c.sql',\n '27a.sql', '27b.sql', '27c.sql',\n '28a.sql', '28b.sql', '28c.sql',\n '29a.sql', '29b.sql', '29c.sql',\n '30a.sql', '30b.sql', '30c.sql',\n '31a.sql', '31b.sql', '31c.sql',\n '32a.sql', '32b.sql',\n '33a.sql', '33b.sql', '33c.sql']\n\n print(all_file_list)\n files_list = []\n for file in all_file_list:\n files_list.append(res_path + file)\n return files_list\n\n\ndef custom_sort(item):\n # \u63d0\u53d6\u6570\u5b57\u548c\u5b57\u6bcd\u90e8\u5206\n match = re.match(r'(\\d+)(\\D+)', item)\n # \u5c06\u6570\u5b57\u90e8\u5206\u8f6c\u6362\u4e3a\u6574\u6570\u4ee5\u8fdb\u884c\u6bd4\u8f83\n num_part = int(match.group(1))\n # \u8fd4\u56de\u5143\u7ec4\u4ee5\u6309\u6570\u5b57\u548c\u5b57\u6bcd\u6392\u5e8f\n return (num_part, match.group(2))\n\n\ndef file_filter(f):\n if f[-4:] == '.sql' and 'schema' not in f and 'fkindexes' not in f:\n return True\n else:\n return False\n\n\ndef get_sql_from_file(file_name):\n file = open(file_name)\n lines = file.readlines().copy()\n sql = ''\n for line in lines:\n sql += line\n sql = sql.replace('\n', ' ').replace(' ', ' ').replace(' ', ' ')\n file.close()\n return sql\n\n\ndef test_hint_from_file(sql_file):\n db = Database()\n sql = get_sql_from_file(sql_file)\n success, result_cont = db.execute_sql(sql)\n print(success, result_cont)\n\n\ndef test_all():\n sql_files = all_sql_files()\n\n for sql_file in list(sql_files)[:-10]:\n if sql_file:\n test_hint_from_file(sql_file)\n\n\ndef test_one():\n res_path = \"{}/join-order-benchmark-master/\".format(\n os.path.dirname(os.path.abspath(__file__)))\n test_hint_from_file(res_path + '1a.sql')\n\n\nif __name__ == '__main__':\n for i in range(0, REPEATCOUNT):\n TIMELOG.write(str(int(time.time()))+\";\")\n test_all()\n TIMELOG.write(str(int(time.time()))+\"\n\")\n TIMELOG.flush()\n\n TIMELOG.close()\n\n)\n", + "description": "In a financial reporting system, multiple users are executing join queries to retrieve data from large database tables. The join operations are not optimized, causing poor performance. Additionally, there is contention for CPU resources among the users, resulting in further degradation of query performance.\n", + "desc": "In a business intelligence scenario, imagine a database called 'AnalyticsDB' used for storing and analyzing various business data. This database contains multiple complex tables such as 'SalesData', 'CustomerProfiles', 'ProductCatalog', and 'MarketTrends', each filled with a large number of rows and columns, recording detailed business information. In this scenario, the company's analysts frequently need to perform complex join queries across these tables to generate comprehensive business reports. For example, they might need to join the 'SalesData' table (which contains sales records) with the 'CustomerProfiles' table (which contains customer information) to analyze the purchasing behaviors of different customer groups. Additionally, they might also need to link these data with the 'ProductCatalog' (the product catalog) and 'MarketTrends' (market trends) tables to gain deeper market insights.However, due to the large size of these tables and the involvement of multiple table joins, the performance of these queries can become very slow. 
If the join keys in these tables are not effectively indexed or if the queries are not properly optimized, executing these join operations can consume a significant amount of time and resources. During peak periods, when multiple complex join queries are executed simultaneously, this can lead to competition for CPU resources, further reducing query efficiency. CPU contention may occur due to too many compute-intensive queries running on the database server or the server's CPU resources being insufficient to handle these queries.\n" + }, + "215": { + "start_time": "1697328228", + "end_time": "1697328367", + "start_timestamp": "2023-10-15 08:03:48", + "end_timestamp": "2023-10-15 08:06:07", + "alerts": [], + "labels": [ + "FETCH_LARGE_DATA", + "CORRELATED SUBQUERY" + ], + "command": "python anomaly_trigger/main.py --anomaly FETCH_LARGE_DATA,CORRELATED_SUBQUERY", + "script": "import os\nimport re\nimport time\n\nimport psycopg2\n\n\nREPEATCOUNT = 1\nTIMELOGPATH = str(int(time.time())) + \"_tpch_trigger_time_log.txt\"\nTIMELOG = open(TIMELOGPATH, 'w+')\n\n\nclass Database():\n\n def __init__(self):\n self.conn = None\n self.conn = psycopg2.connect(database='tpch',\n user='xxxx',\n password='xxxx',\n host='xxxx',\n port=5432)\n\n def execute_sql(self, sql):\n fail = 1\n cur = self.conn.cursor()\n i = 0\n cnt = 3\n while fail == 1 and i < cnt:\n try:\n fail = 0\n cur.execute(sql)\n except BaseException as error:\n fail = 1\n print(error)\n res = []\n if fail == 0:\n res = cur.fetchall()\n i = i + 1\n if fail == 1:\n # print(\"SQL Execution Fatal!!\", sql)\n return 0, ''\n elif fail == 0:\n return 1, res\n\n\ndef all_sql_files():\n res_path = \"{}/tpch-queries/\".format(\n os.path.dirname(os.path.abspath(__file__)))\n # all_file_list = list(filter(file_filter, os.listdir(res_path)))\n # all_file_list = sorted(all_file_list, key=custom_sort)\n all_file_list = [\n '4.explain.sql']\n\n print(all_file_list)\n files_list = []\n for file in all_file_list:\n files_list.append(res_path + file)\n return files_list\n\n\ndef custom_sort(item):\n # \u63d0\u53d6\u6570\u5b57\u548c\u5b57\u6bcd\u90e8\u5206\n match = re.match(r'(\\d+)(\\D+)', item)\n # \u5c06\u6570\u5b57\u90e8\u5206\u8f6c\u6362\u4e3a\u6574\u6570\u4ee5\u8fdb\u884c\u6bd4\u8f83\n num_part = int(match.group(1))\n # \u8fd4\u56de\u5143\u7ec4\u4ee5\u6309\u6570\u5b57\u548c\u5b57\u6bcd\u6392\u5e8f\n return (num_part, match.group(2))\n\n\ndef file_filter(f):\n if f[-4:] == '.sql' and 'schema' not in f and 'fkindexes' not in f:\n return True\n else:\n return False\n\n\ndef get_sql_from_file(file_name):\n file = open(file_name)\n lines = file.readlines().copy()\n sql = ''\n for line in lines:\n sql += line\n sql = sql.replace('\n', ' ').replace(' ', ' ').replace(' ', ' ')\n file.close()\n return sql\n\n\ndef test_hint_from_file(sql_file):\n db = Database()\n sql = get_sql_from_file(sql_file)\n success, result_cont = db.execute_sql(sql)\n print(success, result_cont)\n\n\ndef test_all():\n sql_files = all_sql_files()\n\n for sql_file in list(sql_files):\n if sql_file:\n test_hint_from_file(sql_file)\n\n\ndef test_one():\n res_path = \"{}/tpch-queries/\".format(\n os.path.dirname(os.path.abspath(__file__)))\n test_hint_from_file(res_path + '1.explain.sql')\n\n\nif __name__ == '__main__':\n for i in range(0, REPEATCOUNT):\n TIMELOG.write(str(int(time.time()))+\";\")\n test_all()\n TIMELOG.write(str(int(time.time()))+\"\n\")\n TIMELOG.flush()\n\n TIMELOG.close()\n", + "description": "In an online marketplace, when trying to fetch a large amount of data and 
execute related subqueries to determine the inventory for each product, the performance of the query may deteriorate if the subqueries are not optimized.\n", + "desc": "In an IoT scenario, there is a database used for collecting and analyzing sensor data, called 'SensorDataDB'. This database is designed to handle a large volume of data from various types of sensors. One of the main tables in this database is called 'SensorReadings', which records the readings from multiple sensors. Each reading is associated with a sensor ID, type, value, timestamp, and location. These readings are continuously updated, and the database is frequently queried for analyzing the sensor data and generating reports. However, when conducting complex queries that involve correlated subqueries, fetching large amounts of sensor data can become inefficient. This is especially true when retrieving data for a large number of sensors or when the subqueries take a long time to execute. Due to the complexity and size of the data involved, these types of queries can result in longer query times and potentially consume significant database resources.\n" + }, + "216": { + "start_time": "1697328428", + "end_time": "1697328500", + "start_timestamp": "2023-10-15 08:07:08", + "end_timestamp": "2023-10-15 08:08:20", + "alerts": [], + "labels": [ + "highly concurrent commits or highly concurrent inserts" + ], + "command": "python anomaly_trigger/main.py --anomaly INSERT_LARGE_DATA", + "script": "import psycopg2\nimport sys\nsys.path.append('/root/DB-GPT/')\nimport time\nimport datetime\nimport random\nimport yaml\nfrom multiprocessing.pool import *\n\nclass DBArgs(object):\n\n def __init__(self, dbtype, config, dbname=None):\n self.dbtype = dbtype\n if self.dbtype == 'mysql':\n self.host = config['host']\n self.port = config['port']\n self.user = config['user']\n self.password = config['password']\n self.dbname = dbname if dbname else config['dbname']\n self.driver = 'com.mysql.jdbc.Driver'\n self.jdbc = 'jdbc:mysql://'\n else:\n self.host = config['host']\n self.port = config['port']\n self.user = config['user']\n self.password = config['password']\n self.dbname = dbname if dbname else config['dbname']\n self.driver = 'org.postgresql.Driver'\n self.jdbc = 'jdbc:postgresql://'\n\nclass Database():\n def __init__(self, args, timeout=-1):\n self.args = args\n self.conn = self.resetConn(timeout)\n\n\n # self.schema = self.compute_table_schema()\n\n def resetConn(self, timeout=-1):\n conn = psycopg2.connect(database=self.args.dbname,\n user=self.args.user,\n password=self.args.password,\n host=self.args.host,\n port=self.args.port)\n return conn\n\n def execute_sqls(self,sql):\n self.conn =self.resetConn(timeout=-1)\n cur = self.conn.cursor()\n cur.execute(sql)\n self.conn.commit()\n cur.close()\n self.conn.close()\n\n def execute_sql_duration(self, duration, sql, max_id=0, commit_interval=500):\n self.conn = self.resetConn(timeout=-1)\n cursor = self.conn.cursor()\n start = time.time()\n cnt = 0\n if duration > 0:\n while (time.time() - start) < duration:\n if max_id > 0:\n id = random.randint(1, max_id - 1)\n cursor.execute(sql + str(id) + ';')\n else:\n cursor.execute(sql)\n cnt += 1\n if cnt % commit_interval == 0:\n self.conn.commit()\n else:\n print(\"error, the duration should be larger than 0\")\n self.conn.commit()\n cursor.close()\n self.conn.close()\n return cnt\n\n def concurrent_execute_sql(self, threads, duration, sql, max_id=0, commit_interval=500):\n pool = ThreadPool(threads)\n results = 
[pool.apply_async(self.execute_sql_duration, (duration, sql, max_id, commit_interval)) for _ in range(threads)]\n pool.close()\n pool.join()\n return results\n \ndef init():\n #add the config\n config_path = \"/root/DB-GPT/config/tool_config.yaml\"\n with open(config_path, 'r') as config_file:\n config = yaml.safe_load(config_file) \n db_args =DBArgs('pgsql', config)\n return db_args\n\n#create a table\ndef create_table(table_name,colsize, ncolumns):\n db=Database(init())\n column_definitions = ', '.join(f'name{i} varchar({colsize})' for i in range(ncolumns))\n creat_sql = f'CREATE TABLE {table_name} (id int, {column_definitions}, time timestamp);'\n db.execute_sqls(creat_sql)\n\n#delete the table\ndef delete_table(table_name):\n db=Database(init())\n delete_sql=f'DROP TABLE if exists {table_name}'\n db.execute_sqls(delete_sql)\n\n#print the current time\ndef print_time():\n current_time = datetime.datetime.now()\n formatted_time = current_time.strftime(\"%Y-%m-%d %H:%M:%S\")\n print(formatted_time)\n\n\ndef insert_large_data(threads,duration,ncolumns,nrows,colsize,table_name='table1'):\n\n print_time()\n #Delete undeleted tables\n delete_table(table_name)\n #create a new table\n create_table(table_name,colsize, ncolumns)\n db=Database(init())\n #insert the data\n #insert_definitions = ', '.join(f'repeat(round(random()*999)::text,{(colsize//3)})' for i in range(ncolumns))\n insert_definitions = ', '.join(f'(SELECT substr(md5(random()::text), 1, {colsize}))' for i in range(ncolumns))\n insert_data=f'insert into {table_name} select generate_series(1,{nrows}),{insert_definitions}, now();'\n db.concurrent_execute_sql(threads,duration,insert_data,commit_interval=1)\n\n #delete the table\n delete_table(table_name)\n \n #print the end time\n print_time()\nif __name__ == \"__main__\":\n # Number of threads to use for concurrent inserts\n num_threads = 108\n \n # Duration for which to run the inserts (in seconds)\n insert_duration = None\n \n # Number of columns in the table\n num_columns = 18\n \n # Number of rows to insert\n num_rows = 54\n \n # Size of each column (in characters)\n column_size = 30\n \n # Table name\n table_name = 'table1'\n \n # Call the insert_large_data function\n insert_large_data(num_threads, insert_duration, num_columns, num_rows, column_size, table_name)\n", + "description": "In an IoT application, a large amount of data generated by 108 sensors needs to be inserted into the database simultaneously. This process can cause a database exception. The data is being inserted into a table with 18 columns and 54 rows, and each column can hold up to 30 characters.\n", + "desc": "In an Internet of Things (IoT) scenario, suppose there is a database used to store and analyze sensor data from various IoT devices, named 'SensorDataDB'. This database contains a key table named 'SensorReadings', which records data from different types of sensors. This table consists of 54 rows, each representing a reading from a sensor, with a total of 18 columns, each containing information of up to 30 characters. These columns may include sensor ID, reading type, reading value, timestamp, location, and other related attributes.In this scenario, there are 108 IoT devices that simultaneously send sensor data to the database. These devices could be sensors for environmental monitoring, health tracking, or home automation purposes. 
The data being sent from these devices could include temperature readings, motion detections, humidity levels, or any other relevant sensor data.However, due to the high volume of data being inserted into the 'SensorReadings' table at the same time, the database might encounter performance issues. This could be due to a lack of proper data buffering mechanisms, insufficient indexing, or ineffective data partitioning. As a result, the database's ability to handle such a large influx of concurrent insert operations might be limited, leading to increased write latency or even database locking.These performance issues could impact the efficiency of data processing and analysis in the IoT system. Real-time monitoring and analysis of sensor data might be delayed, affecting the overall functionality and responsiveness of the IoT application or platform.\n" + }, + "217": { + "start_time": "1697328560", + "end_time": "1697328631", + "start_timestamp": "2023-10-15 08:09:20", + "end_timestamp": "2023-10-15 08:10:31", + "alerts": [], + "labels": [ + "highly concurrent commits or highly concurrent inserts" + ], + "command": "python anomaly_trigger/main.py --anomaly INSERT_LARGE_DATA", + "script": "import psycopg2\nimport sys\nsys.path.append('/root/DB-GPT/')\nimport time\nimport datetime\nimport random\nimport yaml\nfrom multiprocessing.pool import *\n\nclass DBArgs(object):\n\n def __init__(self, dbtype, config, dbname=None):\n self.dbtype = dbtype\n if self.dbtype == 'mysql':\n self.host = config['host']\n self.port = config['port']\n self.user = config['user']\n self.password = config['password']\n self.dbname = dbname if dbname else config['dbname']\n self.driver = 'com.mysql.jdbc.Driver'\n self.jdbc = 'jdbc:mysql://'\n else:\n self.host = config['host']\n self.port = config['port']\n self.user = config['user']\n self.password = config['password']\n self.dbname = dbname if dbname else config['dbname']\n self.driver = 'org.postgresql.Driver'\n self.jdbc = 'jdbc:postgresql://'\n\nclass Database():\n def __init__(self, args, timeout=-1):\n self.args = args\n self.conn = self.resetConn(timeout)\n\n\n # self.schema = self.compute_table_schema()\n\n def resetConn(self, timeout=-1):\n conn = psycopg2.connect(database=self.args.dbname,\n user=self.args.user,\n password=self.args.password,\n host=self.args.host,\n port=self.args.port)\n return conn\n\n def execute_sqls(self,sql):\n self.conn =self.resetConn(timeout=-1)\n cur = self.conn.cursor()\n cur.execute(sql)\n self.conn.commit()\n cur.close()\n self.conn.close()\n\n def execute_sql_duration(self, duration, sql, max_id=0, commit_interval=500):\n self.conn = self.resetConn(timeout=-1)\n cursor = self.conn.cursor()\n start = time.time()\n cnt = 0\n if duration > 0:\n while (time.time() - start) < duration:\n if max_id > 0:\n id = random.randint(1, max_id - 1)\n cursor.execute(sql + str(id) + ';')\n else:\n cursor.execute(sql)\n cnt += 1\n if cnt % commit_interval == 0:\n self.conn.commit()\n else:\n print(\"error, the duration should be larger than 0\")\n self.conn.commit()\n cursor.close()\n self.conn.close()\n return cnt\n\n def concurrent_execute_sql(self, threads, duration, sql, max_id=0, commit_interval=500):\n pool = ThreadPool(threads)\n results = [pool.apply_async(self.execute_sql_duration, (duration, sql, max_id, commit_interval)) for _ in range(threads)]\n pool.close()\n pool.join()\n return results\n \ndef init():\n #add the config\n config_path = \"/root/DB-GPT/config/tool_config.yaml\"\n with open(config_path, 'r') as config_file:\n config = 
yaml.safe_load(config_file) \n db_args =DBArgs('pgsql', config)\n return db_args\n\n#create a table\ndef create_table(table_name,colsize, ncolumns):\n db=Database(init())\n column_definitions = ', '.join(f'name{i} varchar({colsize})' for i in range(ncolumns))\n creat_sql = f'CREATE TABLE {table_name} (id int, {column_definitions}, time timestamp);'\n db.execute_sqls(creat_sql)\n\n#delete the table\ndef delete_table(table_name):\n db=Database(init())\n delete_sql=f'DROP TABLE if exists {table_name}'\n db.execute_sqls(delete_sql)\n\n#print the current time\ndef print_time():\n current_time = datetime.datetime.now()\n formatted_time = current_time.strftime(\"%Y-%m-%d %H:%M:%S\")\n print(formatted_time)\n\n\ndef insert_large_data(threads,duration,ncolumns,nrows,colsize,table_name='table1'):\n\n print_time()\n #Delete undeleted tables\n delete_table(table_name)\n #create a new table\n create_table(table_name,colsize, ncolumns)\n db=Database(init())\n #insert the data\n #insert_definitions = ', '.join(f'repeat(round(random()*999)::text,{(colsize//3)})' for i in range(ncolumns))\n insert_definitions = ', '.join(f'(SELECT substr(md5(random()::text), 1, {colsize}))' for i in range(ncolumns))\n insert_data=f'insert into {table_name} select generate_series(1,{nrows}),{insert_definitions}, now();'\n db.concurrent_execute_sql(threads,duration,insert_data,commit_interval=1)\n\n #delete the table\n delete_table(table_name)\n \n #print the end time\n print_time()\nif __name__ == \"__main__\":\n # Number of threads to use for concurrent inserts\n num_threads = 108\n \n # Duration for which to run the inserts (in seconds)\n insert_duration = None\n \n # Number of columns in the table\n num_columns = 37\n \n # Number of rows to insert\n num_rows = 95\n \n # Size of each column (in characters)\n column_size = 92\n \n # Table name\n table_name = 'table1'\n \n # Call the insert_large_data function\n insert_large_data(num_threads, insert_duration, num_columns, num_rows, column_size, table_name)\n", + "description": "In an IoT application, 108 sensors generate a large amount of data that needs to be inserted into a database simultaneously. This process of inserting data into a database with 37 columns, each with a size of 92 characters, and a total of 95 rows, causes a database exception.\n", + "desc": "In an Internet of Things (IoT) scenario, there is a database used for collecting and analyzing sensor data, called 'SensorDataDB'. This database is designed to handle a large volume of data from various types of sensors. One table in the database is named 'SensorReadings', and it stores information about different sensor readings. This table contains 95 rows of data, each representing a reading from a sensor, and it has 37 columns, each containing information of up to 92 characters. These columns might include sensor ID, reading type (such as temperature, humidity, pressure), reading value, timestamp, sensor location, and status information. At a specific time, 108 sensors start transmitting data simultaneously at a high frequency. This large influx of data can potentially overwhelm the database, especially if the database is not properly optimized to handle such a high volume of incoming data. Without proper indexing, partitioning, or buffering mechanisms, the database's ability to efficiently process these concurrent write requests may be significantly impacted. This could result in increased write latency, database locking, or other anomalies in the database. 
The performance of the entire IoT system might be affected, leading to delays in data processing and potential disruptions in the overall functionality of the system.\n" + }, + "218": { + "start_time": "1697328691", + "end_time": "1697328752", + "start_timestamp": "2023-10-15 08:11:31", + "end_timestamp": "2023-10-15 08:12:32", + "alerts": [], + "labels": [ + "highly concurrent updates" + ], + "command": "python anomaly_trigger/main.py --anomaly LOCK_CONTENTION", + "script": "import psycopg2\nimport sys\nsys.path.append('/root/DB-GPT/')\nimport time\nimport datetime\nimport random\nimport yaml\nfrom multiprocessing.pool import *\n\n\nclass DBArgs(object):\n\n def __init__(self, dbtype, config, dbname=None):\n self.dbtype = dbtype\n if self.dbtype == 'mysql':\n self.host = config['host']\n self.port = config['port']\n self.user = config['user']\n self.password = config['password']\n self.dbname = dbname if dbname else config['dbname']\n self.driver = 'com.mysql.jdbc.Driver'\n self.jdbc = 'jdbc:mysql://'\n else:\n self.host = config['host']\n self.port = config['port']\n self.user = config['user']\n self.password = config['password']\n self.dbname = dbname if dbname else config['dbname']\n self.driver = 'org.postgresql.Driver'\n self.jdbc = 'jdbc:postgresql://'\n\nclass Database():\n def __init__(self, args, timeout=-1):\n self.args = args\n self.conn = self.resetConn(timeout)\n\n\n # self.schema = self.compute_table_schema()\n\n def resetConn(self, timeout=-1):\n conn = psycopg2.connect(database=self.args.dbname,\n user=self.args.user,\n password=self.args.password,\n host=self.args.host,\n port=self.args.port)\n return conn\n \n def execute_sqls(self,sql):\n self.conn =self.resetConn(timeout=-1)\n cur = self.conn.cursor()\n cur.execute(sql)\n self.conn.commit()\n cur.close()\n self.conn.close()\n\n def execute_sql_duration(self, duration, sql, max_id=0, commit_interval=500):\n self.conn = self.resetConn(timeout=-1)\n cursor = self.conn.cursor()\n start = time.time()\n cnt = 0\n if duration > 0:\n while (time.time() - start) < duration:\n if max_id > 0:\n id = random.randint(1, max_id - 1)\n cursor.execute(sql + str(id) + ';')\n else:\n cursor.execute(sql)\n cnt += 1\n if cnt % commit_interval == 0:\n self.conn.commit()\n else:\n print(\"error, the duration should be larger than 0\")\n self.conn.commit()\n cursor.close()\n self.conn.close()\n return cnt\n\n def concurrent_execute_sql(self, threads, duration, sql, max_id=0, commit_interval=500):\n pool = ThreadPool(threads)\n results = [pool.apply_async(self.execute_sql_duration, (duration, sql, max_id, commit_interval)) for _ in range(threads)]\n pool.close()\n pool.join()\n return results\n\ndef init():\n #add the config\n config_path = \"/root/DB-GPT/config/tool_config.yaml\"\n with open(config_path, 'r') as config_file:\n config = yaml.safe_load(config_file) \n db_args =DBArgs('pgsql', config)\n return db_args\n\n\n#create a table\ndef create_table(table_name,colsize, ncolumns):\n db=Database(init())\n column_definitions = ', '.join(f'name{i} varchar({colsize})' for i in range(ncolumns))\n creat_sql = f'CREATE TABLE {table_name} (id int, {column_definitions}, time timestamp);'\n db.execute_sqls(creat_sql)\n\n#delete the table\ndef delete_table(table_name):\n db=Database(init())\n delete_sql=f'DROP TABLE if exists {table_name}'\n db.execute_sqls(delete_sql)\n\n#print the current time\ndef print_time():\n current_time = datetime.datetime.now()\n formatted_time = current_time.strftime(\"%Y-%m-%d %H:%M:%S\")\n print(formatted_time)\n\n\ndef 
lock_contention(threads,duration,ncolumns,nrows,colsize,table_name='table1'):\n #create a new table\n print_time()\n delete_table(table_name)\n create_table(table_name,colsize, ncolumns)\n db=Database(init())\n # insert some data to be updated \n insert_definitions = ', '.join(f'(SELECT substr(md5(random()::text), 1, {colsize}))' for i in range(ncolumns))\n insert_data=f'insert into {table_name} select generate_series(1,{nrows}),{insert_definitions}, now();' \n db.execute_sqls(insert_data) \n pool = Pool(threads)\n for _ in range(threads):\n pool.apply_async(\n lock, (table_name, ncolumns, colsize, duration, nrows))\n pool.close()\n pool.join()\n #delete the table\n delete_table(table_name)\n print_time()\n\ndef lock(table_name, ncolumns, colsize, duration, nrows):\n args=init()\n start = time.time()\n #lock_contention\n while time.time()-start < duration:\n conn = psycopg2.connect(database=args.dbname, user=args.user, password=args.password,\n host=args.host, port=args.port)\n cur = conn.cursor()\n while time.time()-start < duration:\n col_name = random.randint(0, ncolumns-1)\n row_name = random.randint(1, nrows-1)\n lock_contention = f'update {table_name} set name{col_name}=(SELECT substr(md5(random()::text), 1, {colsize})) where id ={row_name}'\n #db.concurrent_execute_sql(threads,duration,lock_contention,nrows)\n cur.execute(lock_contention)\n conn.commit()\n conn.commit()\n conn.close()\n\nif __name__ == \"__main__\":\n # Number of threads to use for concurrent inserts\n num_threads = 140\n \n # Duration for which to run the inserts (in seconds)\n insert_duration = None\n \n # Number of columns in the table\n num_columns = 82\n \n # Number of rows to insert\n num_rows = 395\n \n # Size of each column (in characters)\n column_size = 93\n \n # Table name\n table_name = 'table1'\n \n # Call the insert_large_data function\n lock_contention(num_threads, insert_duration, num_columns, num_rows, column_size, table_name)\n", + "description": "In the database of an online store, 140 users are trying to perform simultaneous update operations on a database table containing 82 columns and 395 rows of product records, where each column has a size of 93 characters. These users are competing with each other to lock the database table, which may lead to a database exception.\n", + "desc": "In an e-commerce environment, there is a database used for storing and managing product information, called 'ProductDB'. This database contains a key table named 'ProductDetails', which stores details about various products. Each row in this table represents a different product, and there are a total of 395 rows. The table consists of 82 columns, with each column able to store up to 93 characters. These columns may include product ID, name, price, stock quantity, brand, category, size, color, weight, user rating, number of reviews, production date, expiration date, supplier information, country, shipping method, discount information, image link, description, and product status. The database operates in a high-concurrency environment, with 140 users simultaneously accessing and updating the product information. 
However, due to the high number of concurrent transactions and the lack of proper locking mechanisms, there can be contention for accessing the database, leading to performance issues such as delays, failures, or even data corruption.\n" + }, + "219": { + "start_time": "1697328812", + "end_time": "1697328864", + "start_timestamp": "2023-10-15 08:13:32", + "end_timestamp": "2023-10-15 08:14:24", + "alerts": [], + "labels": [ + "highly deletes" + ], + "command": "python anomaly_trigger/main.py --anomaly VACUUM", + "script": "import psycopg2\nimport sys\nsys.path.append('/root/DB-GPT/')\nimport time\nimport datetime\nimport random\nimport yaml\nfrom multiprocessing.pool import *\n\n\nclass DBArgs(object):\n\n def __init__(self, dbtype, config, dbname=None):\n self.dbtype = dbtype\n if self.dbtype == 'mysql':\n self.host = config['host']\n self.port = config['port']\n self.user = config['user']\n self.password = config['password']\n self.dbname = dbname if dbname else config['dbname']\n self.driver = 'com.mysql.jdbc.Driver'\n self.jdbc = 'jdbc:mysql://'\n else:\n self.host = config['host']\n self.port = config['port']\n self.user = config['user']\n self.password = config['password']\n self.dbname = dbname if dbname else config['dbname']\n self.driver = 'org.postgresql.Driver'\n self.jdbc = 'jdbc:postgresql://'\n\nclass Database():\n def __init__(self, args, timeout=-1):\n self.args = args\n self.conn = self.resetConn(timeout)\n\n\n # self.schema = self.compute_table_schema()\n\n def resetConn(self, timeout=-1):\n conn = psycopg2.connect(database=self.args.dbname,\n user=self.args.user,\n password=self.args.password,\n host=self.args.host,\n port=self.args.port)\n return conn\n \n def execute_sqls(self,sql):\n self.conn =self.resetConn(timeout=-1)\n cur = self.conn.cursor()\n cur.execute(sql)\n self.conn.commit()\n cur.close()\n self.conn.close()\n\n def execute_sql_duration(self, duration, sql, max_id=0, commit_interval=500):\n self.conn = self.resetConn(timeout=-1)\n cursor = self.conn.cursor()\n start = time.time()\n cnt = 0\n if duration > 0:\n while (time.time() - start) < duration:\n if max_id > 0:\n id = random.randint(1, max_id - 1)\n cursor.execute(sql + str(id) + ';')\n else:\n cursor.execute(sql)\n cnt += 1\n if cnt % commit_interval == 0:\n self.conn.commit()\n else:\n print(\"error, the duration should be larger than 0\")\n self.conn.commit()\n cursor.close()\n self.conn.close()\n return cnt\n\n def concurrent_execute_sql(self, threads, duration, sql, max_id=0, commit_interval=500):\n pool = ThreadPool(threads)\n results = [pool.apply_async(self.execute_sql_duration, (duration, sql, max_id, commit_interval)) for _ in range(threads)]\n pool.close()\n pool.join()\n return results\n \ndef init():\n #add the config\n config_path = \"/root/DB-GPT/config/tool_config.yaml\"\n with open(config_path, 'r') as config_file:\n config = yaml.safe_load(config_file) \n db_args =DBArgs('pgsql', config)\n return db_args\n\n\n#create a table\ndef create_table(table_name,colsize, ncolumns):\n db=Database(init())\n column_definitions = ', '.join(f'name{i} varchar({colsize})' for i in range(ncolumns))\n creat_sql = f'CREATE TABLE {table_name} (id int, {column_definitions}, time timestamp);'\n db.execute_sqls(creat_sql)\n\n#delete the table\ndef delete_table(table_name):\n db=Database(init())\n delete_sql=f'DROP TABLE if exists {table_name}'\n db.execute_sqls(delete_sql)\n\n#print the current time\ndef print_time():\n current_time = datetime.datetime.now()\n formatted_time = 
current_time.strftime(\"%Y-%m-%d %H:%M:%S\")\n print(formatted_time)\n\n\ndef vacuum(threads,duration,ncolumns,nrows,colsize,table_name='table1'):\n db=Database(init())\n #create a new table\n print_time()\n delete_table(table_name)\n create_table(table_name,colsize, ncolumns)\n\n # insert some data to be deleted\n insert_definitions = ', '.join(f'(SELECT substr(md5(random()::text), 1, {colsize}))' for i in range(ncolumns))\n insert_data=f'insert into {table_name} select generate_series(1,{nrows}),{insert_definitions}, now();' \n db.execute_sqls(insert_data) \n\n # delete 80% of the rows\n delete_nrows=int(nrows*0.8)\n vacuum=f'delete from {table_name} where id < {delete_nrows};'\n db.execute_sqls(vacuum)\n\n # do the select , then the vacuum occurs\n select='select * from '+table_name+' where id='\n db.concurrent_execute_sql(threads,duration,select,nrows)\n\n #delete the table\n delete_table(table_name)\n print_time()\n\n\nif __name__ == \"__main__\":\n # Number of threads to use for concurrent inserts\n num_threads = 117\n \n # Duration for which to run the inserts (in seconds)\n insert_duration = None\n \n # Number of columns in the table\n num_columns = 18\n \n # Number of rows to insert\n num_rows = 2809399\n \n # Size of each column (in characters)\n column_size = 92\n \n # Table name\n table_name = 'table1'\n \n # Call the insert_large_data function\n vacuum(num_threads, insert_duration, num_columns, num_rows, column_size, table_name)\n", + "description": "In a large online store, there are 117 simultaneous searches performed by users in the database table containing 18 columns and 2,809,399 rows of product records, each column size being 92 characters. This search is performed after a vacuum operation, which is a data cleaning process in the database.\n", + "desc": "In an e-commerce database, specifically in the OnlineStoreDB, there is a key table called \"ProductRecords\", which stores detailed information about various products. This table consists of 2,809,399 rows of data, with each row representing a unique product entry. The table has 18 columns, each capable of storing information up to 92 characters in length. These columns include product ID, name, price, stock quantity, description, brand, category, size, color, weight, production date, expiration date, supplier information, country, rating, number of reviews, sales status (e.g., available or out of stock), and image link. At some point, the database administrator needs to perform a \"VACUUM\" operation, which is a maintenance operation used to reclaim space that is no longer being used by the database. This operation involves scanning the entire table and removing any unused space caused by previous updates or deletions. 
In this case, the \"VACUUM\" operation is being performed with 117 concurrent threads, which means multiple threads will be working simultaneously to perform the operation.\n" + }, + "220": { + "start_time": "1697328924", + "end_time": "1697329039", + "start_timestamp": "2023-10-15 08:15:24", + "end_timestamp": "2023-10-15 08:17:19", + "alerts": [], + "labels": [ + "too many indexes" + ], + "command": "python anomaly_trigger/main.py --anomaly REDUNDANT_INDEX", + "script": "import psycopg2\nimport sys\nsys.path.append('/root/DB-GPT/')\nimport time\nimport datetime\nimport random\nimport yaml\nfrom multiprocessing.pool import *\n\n\nclass DBArgs(object):\n\n def __init__(self, dbtype, config, dbname=None):\n self.dbtype = dbtype\n if self.dbtype == 'mysql':\n self.host = config['host']\n self.port = config['port']\n self.user = config['user']\n self.password = config['password']\n self.dbname = dbname if dbname else config['dbname']\n self.driver = 'com.mysql.jdbc.Driver'\n self.jdbc = 'jdbc:mysql://'\n else:\n self.host = config['host']\n self.port = config['port']\n self.user = config['user']\n self.password = config['password']\n self.dbname = dbname if dbname else config['dbname']\n self.driver = 'org.postgresql.Driver'\n self.jdbc = 'jdbc:postgresql://'\n\nclass Database():\n def __init__(self, args, timeout=-1):\n self.args = args\n self.conn = self.resetConn(timeout)\n\n\n # self.schema = self.compute_table_schema()\n\n def resetConn(self, timeout=-1):\n conn = psycopg2.connect(database=self.args.dbname,\n user=self.args.user,\n password=self.args.password,\n host=self.args.host,\n port=self.args.port)\n return conn\n \n def execute_sqls(self,sql):\n self.conn =self.resetConn(timeout=-1)\n cur = self.conn.cursor()\n cur.execute(sql)\n self.conn.commit()\n cur.close()\n self.conn.close()\n\n def execute_sql_duration(self, duration, sql, max_id=0, commit_interval=500):\n self.conn = self.resetConn(timeout=-1)\n cursor = self.conn.cursor()\n start = time.time()\n cnt = 0\n if duration > 0:\n while (time.time() - start) < duration:\n if max_id > 0:\n id = random.randint(1, max_id - 1)\n cursor.execute(sql + str(id) + ';')\n else:\n cursor.execute(sql)\n cnt += 1\n if cnt % commit_interval == 0:\n self.conn.commit()\n else:\n print(\"error, the duration should be larger than 0\")\n self.conn.commit()\n cursor.close()\n self.conn.close()\n return cnt\n\n def concurrent_execute_sql(self, threads, duration, sql, max_id=0, commit_interval=500):\n pool = ThreadPool(threads)\n results = [pool.apply_async(self.execute_sql_duration, (duration, sql, max_id, commit_interval)) for _ in range(threads)]\n pool.close()\n pool.join()\n return results\n \n def build_index(self, table_name, idx_num):\n self.conn = self.resetConn(timeout=-1)\n cursor = self.conn.cursor()\n \n for i in range(0, idx_num):\n the_sql = 'CREATE INDEX index_' + table_name + '_' + str(i) + ' ON ' + table_name + '(name' + str(i) + ');'\n print(the_sql)\n cursor.execute(the_sql)\n\n \n self.conn.commit()\n self.conn.close()\n return\n\n\n \n def drop_index(self,table_name):\n self.conn = self.resetConn(timeout=-1)\n cursor = self.conn.cursor()\n cursor.execute(\"select indexname from pg_indexes where tablename='\"+table_name+\"';\")\n idxs = cursor.fetchall()\n for idx in idxs:\n the_sql = 'DROP INDEX ' + idx[0] + ';'\n cursor.execute(the_sql)\n print(the_sql)\n self.conn.commit()\n self.conn.close()\n return\n\n\ndef init():\n #add the config\n config_path = \"/root/DB-GPT/config/tool_config.yaml\"\n with open(config_path, 'r') as 
config_file:\n config = yaml.safe_load(config_file) \n db_args =DBArgs('pgsql', config)\n return db_args\n\n\n#create a table\ndef create_table(table_name,colsize, ncolumns):\n db=Database(init())\n column_definitions = ', '.join(f'name{i} varchar({colsize})' for i in range(ncolumns))\n creat_sql = f'CREATE TABLE {table_name} (id int, {column_definitions}, time timestamp);'\n db.execute_sqls(creat_sql)\n\n#delete the table\ndef delete_table(table_name):\n db=Database(init())\n delete_sql=f'DROP TABLE if exists {table_name}'\n db.execute_sqls(delete_sql)\n\n#print the current time\ndef print_time():\n current_time = datetime.datetime.now()\n formatted_time = current_time.strftime(\"%Y-%m-%d %H:%M:%S\")\n print(formatted_time)\n\ndef redundent_index(threads,duration,ncolumns,nrows,colsize,nindex,table_name='table1'):\n #create a new table\n print_time()\n delete_table(table_name)\n create_table(table_name,colsize, ncolumns)\n db=Database(init())\n # insert some data to be updated \n insert_definitions = ', '.join(f'(SELECT substr(md5(random()::text), 1, {colsize}))' for i in range(ncolumns))\n insert_data=f'insert into {table_name} select generate_series(1,{nrows}),{insert_definitions}, now();' \n db.execute_sqls(insert_data) \n\n #initialization of the indexes\n nindex=int((nindex*ncolumns)/10)\n db.build_index(table_name,nindex)\n id_index='CREATE INDEX index_'+table_name+'_id ON '+table_name+'(id);'\n db.execute_sqls(id_index)\n\n #lock_contention\n pool = Pool(threads)\n for _ in range(threads):\n pool.apply_async(\n lock, (table_name, ncolumns, colsize, duration, nrows))\n pool.close()\n pool.join()\n\n #drop the index\n db.drop_index(table_name)\n\n #delete the table\n delete_table(table_name)\n print_time()\n\n\ndef lock(table_name, ncolumns, colsize, duration, nrows):\n args=init()\n start = time.time()\n #lock_contention\n while time.time()-start < duration:\n conn = psycopg2.connect(database=args.dbname, user=args.user, password=args.password,\n host=args.host, port=args.port)\n cur = conn.cursor()\n while time.time()-start < duration:\n col_name = random.randint(0, ncolumns-1)\n row_name = random.randint(1, nrows-1)\n lock_contention = f'update {table_name} set name{col_name}=(SELECT substr(md5(random()::text), 1, {colsize})) where id ={row_name}'\n #db.concurrent_execute_sql(threads,duration,lock_contention,nrows)\n cur.execute(lock_contention)\n conn.commit()\n conn.commit()\n conn.close()\n\nif __name__ == \"__main__\":\n # Number of threads to use for concurrent inserts\n num_threads = 5\n \n # Duration for which to run the inserts (in seconds)\n insert_duration = None\n \n # Number of columns in the table\n num_columns = 72\n \n # Number of rows to insert\n num_rows = 462113\n \n # Size of each column (in characters)\n column_size = 50\n \n # Table name\n table_name = 'table1'\n \n nindex=6\n \n # Call the insert_large_data function\n redundent_index(num_threads, insert_duration, num_columns, num_rows, column_size, nindex,table_name)\n", + "description": "In a financial database that stores transaction records, if multiple users simultaneously perform queries on a table containing 72 columns and 462,113 rows, each column having a size of 50 characters, and there are redundant indexes created initially for transaction details such as account number, transaction type, and date, it may result in additional storage overhead and decreased query performance.\n", + "desc": "In a business intelligence environment, there is a database named 'BI_DB' that stores various types of data 
used for analysis and reporting. Within this database, there is a table called 'AnalysisData' that contains a total of 462,113 rows of data. Each row represents an individual data entry, and each entry has 72 columns, each capable of storing up to 50 characters. These columns might include data such as transaction ID, customer information, sales figures, date and time, product information, and other relevant details. In order to optimize the performance of complex queries and accelerate the data analysis process, the database administrator has created multiple redundant indexes on the 'AnalysisData' table. These indexes might be based on different columns, such as transaction type, customer location, sales amount, or product category. However, the creation of redundant indexes can consume additional storage space and might lead to performance overhead, impacting the efficiency of query execution and potentially slowing down the overall analysis process.\n" + }, + "221": { + "start_time": "1697329099", + "end_time": "1697329190", + "start_timestamp": "2023-10-15 08:18:19", + "end_timestamp": "2023-10-15 08:19:50", + "alerts": [], + "labels": [ + "INSERT_LARGE_DATA", + "IO_CONTENTION" + ], + "command": "python anomaly_trigger/main.py --anomaly INSERT_LARGE_DATA,IO_CONTENTION", + "script": "import os\nimport datetime\n\n#print the current time\ndef print_time():\n current_time = datetime.datetime.now()\n formatted_time = current_time.strftime(\"%Y-%m-%d %H:%M:%S\")\n print(formatted_time)\n\nif __name__ == \"__main__\":\n print_time()\n command = (\n \"su - root -c 'cd /sysbench-tpcc-master; \"\n \"./tpcc.lua --db-driver=pgsql --tables=2 --scale=3 --threads=50 --events=0 \"\n \"--pgsql-host=xxxx --pgsql-user=xxxx --pgsql-password=xxxx \"\n \"--pgsql-port=5432 --pgsql-db=tpcc --time=90 --rand-type=uniform --report-interval=10 run'\"\n )\n\n os.system(command)\n print_time()\n", + "description": "In a file sharing system, multiple users are trying to upload, download, or edit files at the same time. This creates competition for input/output resources, leading to slower file transfers.\n", + "desc": "In an IoT scenario, we can consider a database named 'IoTDataDB', which is used to store and analyze data collected from various IoT devices. This database contains a key table named 'DeviceReadings', which records sensor readings and other data coming from IoT devices. Suppose this table contains a large volume of data, with each row representing a reading from a specific device. The table has multiple columns, each containing information such as device ID, sensor type, reading value, timestamp, location, and other related data. In this scenario, due to the continuous influx of data from IoT devices, the database might face challenges related to inserting large amounts of data while also dealing with I/O contention issues. When multiple devices attempt to simultaneously send large volumes of data to the database, the system's storage and network bandwidth might become overwhelmed. This I/O contention can lead to slower data insertion speeds, especially when there is limited bandwidth or insufficient server processing capabilities. Additionally, the frequent write operations involved in inserting large amounts of data can impact database performance, potentially causing locking and transaction management issues during peak periods. 
As a result, the overall efficiency of data collection and analysis in the IoT system might be affected.\n" + }, + "222": { + "start_time": "1697329250", + "end_time": "1697329310", + "start_timestamp": "2023-10-15 08:20:50", + "end_timestamp": "2023-10-15 08:21:50", + "alerts": [], + "labels": [ + "POOR JOIN PERFORMANCE", + "CPU CONTENTION" + ], + "command": "python anomaly_trigger/main.py --anomaly POOR_JOIN_PERFORMANCE,CPU_CONTENTION", + "script": "import os\nimport re\nimport time\n\nimport psycopg2\n\n\nREPEATCOUNT = 1\nTIMELOGPATH = str(int(time.time())) + \"_job_trigger_time_log.txt\"\nTIMELOG = open(TIMELOGPATH, 'w+')\n\n\nclass Database():\n\n def __init__(self):\n self.conn = None\n self.conn = psycopg2.connect(database='imdbload',\n user='xxxx',\n password='xxxx',\n host='xxxx',\n port=5432)\n\n def execute_sql(self, sql):\n fail = 1\n cur = self.conn.cursor()\n i = 0\n cnt = 3\n while fail == 1 and i < cnt:\n try:\n fail = 0\n cur.execute(sql)\n except BaseException as error:\n fail = 1\n print(error)\n res = []\n if fail == 0:\n res = cur.fetchall()\n i = i + 1\n if fail == 1:\n # print(\"SQL Execution Fatal!!\", sql)\n return 0, ''\n elif fail == 0:\n return 1, res\n\n\ndef all_sql_files():\n res_path = \"{}/join-order-benchmark-master/\".format(\n os.path.dirname(os.path.abspath(__file__)))\n # all_file_list = list(filter(file_filter, os.listdir(res_path)))\n # all_file_list = sorted(all_file_list, key=custom_sort)\n all_file_list = [\n '1a.sql', '1b.sql', '1c.sql', '1d.sql',\n '2a.sql', '2b.sql', '2c.sql', '2d.sql',\n '3a.sql', '3b.sql', '3c.sql',\n '4a.sql', '4b.sql', '4c.sql',\n '5a.sql', '5b.sql', '5c.sql',\n '6a.sql', '6b.sql', '6c.sql', '6d.sql', '6e.sql', '6f.sql',\n '7a.sql', '7b.sql', '7c.sql',\n '8a.sql', '8b.sql', '8c.sql', '8d.sql',\n '9a.sql', '9b.sql', '9c.sql', '9d.sql',\n '10a.sql', '10b.sql', '10c.sql',\n '11a.sql', '11b.sql', '11c.sql', '11d.sql',\n '12a.sql', '12b.sql', '12c.sql',\n '13a.sql', '13b.sql', '13c.sql', '13d.sql',\n '14a.sql', '14b.sql', '14c.sql',\n '15a.sql', '15b.sql', '15c.sql', '15d.sql',\n '16a.sql', '16b.sql', '16c.sql', '16d.sql',\n '17a.sql', '17b.sql', '17c.sql', '17d.sql', '17e.sql', '17f.sql',\n '18a.sql', '18b.sql', '18c.sql',\n '19a.sql', '19b.sql', '19c.sql', '19d.sql',\n '20a.sql', '20b.sql', '20c.sql',\n '21a.sql', '21b.sql', '21c.sql',\n '22a.sql', '22b.sql', '22c.sql', '22d.sql',\n '23a.sql', '23b.sql', '23c.sql',\n '24a.sql', '24b.sql',\n '25a.sql', '25b.sql', '25c.sql',\n '26a.sql', '26b.sql', '26c.sql',\n '27a.sql', '27b.sql', '27c.sql',\n '28a.sql', '28b.sql', '28c.sql',\n '29a.sql', '29b.sql', '29c.sql',\n '30a.sql', '30b.sql', '30c.sql',\n '31a.sql', '31b.sql', '31c.sql',\n '32a.sql', '32b.sql',\n '33a.sql', '33b.sql', '33c.sql']\n\n print(all_file_list)\n files_list = []\n for file in all_file_list:\n files_list.append(res_path + file)\n return files_list\n\n\ndef custom_sort(item):\n # \u63d0\u53d6\u6570\u5b57\u548c\u5b57\u6bcd\u90e8\u5206\n match = re.match(r'(\\d+)(\\D+)', item)\n # \u5c06\u6570\u5b57\u90e8\u5206\u8f6c\u6362\u4e3a\u6574\u6570\u4ee5\u8fdb\u884c\u6bd4\u8f83\n num_part = int(match.group(1))\n # \u8fd4\u56de\u5143\u7ec4\u4ee5\u6309\u6570\u5b57\u548c\u5b57\u6bcd\u6392\u5e8f\n return (num_part, match.group(2))\n\n\ndef file_filter(f):\n if f[-4:] == '.sql' and 'schema' not in f and 'fkindexes' not in f:\n return True\n else:\n return False\n\n\ndef get_sql_from_file(file_name):\n file = open(file_name)\n lines = file.readlines().copy()\n sql = ''\n for line in lines:\n sql += line\n sql = 
sql.replace('\n', ' ').replace(' ', ' ').replace(' ', ' ')\n file.close()\n return sql\n\n\ndef test_hint_from_file(sql_file):\n db = Database()\n sql = get_sql_from_file(sql_file)\n success, result_cont = db.execute_sql(sql)\n print(success, result_cont)\n\n\ndef test_all():\n sql_files = all_sql_files()\n\n for sql_file in list(sql_files)[:-10]:\n if sql_file:\n test_hint_from_file(sql_file)\n\n\ndef test_one():\n res_path = \"{}/join-order-benchmark-master/\".format(\n os.path.dirname(os.path.abspath(__file__)))\n test_hint_from_file(res_path + '1a.sql')\n\n\nif __name__ == '__main__':\n for i in range(0, REPEATCOUNT):\n TIMELOG.write(str(int(time.time()))+\";\")\n test_all()\n TIMELOG.write(str(int(time.time()))+\"\n\")\n TIMELOG.flush()\n\n TIMELOG.close()\n\n)\n", + "description": "In a data analytics company, multiple data analysts are executing complex queries that involve joining large tables. The queries are resource-intensive and cause contention for CPU resources, resulting in degraded performance.\n", + "desc": "In the business intelligence scenario of an e-commerce platform, there is a database called 'CorporateAnalyticsDB' that stores various business data of a large corporation. This database contains tables such as 'SalesData', 'CustomerProfiles', 'ProductCatalog', and 'MarketTrends', which contain detailed information about sales records, customer profiles, product catalog, and market trends, respectively. The company's analysts often perform complex join queries across these tables to generate comprehensive business reports. These queries involve linking data from the different tables to analyze purchasing behaviors, market insights, and other business metrics. However, due to the large size of the tables and the complexity of the join operations, the performance of these queries is slow. If the join keys are not properly indexed or if the queries are not optimized, they can consume significant time and resources. During peak periods, when multiple complex join queries are executed simultaneously, there could be competition for CPU resources, further impacting query efficiency. 
This CPU contention may occur when there are too many compute-intensive queries running on the database server or when the server's CPU resources are insufficient to handle the workload.\n" + }, + "223": { + "start_time": "1697329370", + "end_time": "1697329518", + "start_timestamp": "2023-10-15 08:22:50", + "end_timestamp": "2023-10-15 08:25:18", + "alerts": [], + "labels": [ + "FETCH_LARGE_DATA", + "CORRELATED SUBQUERY" + ], + "command": "python anomaly_trigger/main.py --anomaly FETCH_LARGE_DATA,CORRELATED_SUBQUERY", + "script": "import os\nimport re\nimport time\n\nimport psycopg2\n\n\nREPEATCOUNT = 1\nTIMELOGPATH = str(int(time.time())) + \"_tpch_trigger_time_log.txt\"\nTIMELOG = open(TIMELOGPATH, 'w+')\n\n\nclass Database():\n\n def __init__(self):\n self.conn = None\n self.conn = psycopg2.connect(database='tpch',\n user='xxxx',\n password='xxxx',\n host='xxxx',\n port=5432)\n\n def execute_sql(self, sql):\n fail = 1\n cur = self.conn.cursor()\n i = 0\n cnt = 3\n while fail == 1 and i < cnt:\n try:\n fail = 0\n cur.execute(sql)\n except BaseException as error:\n fail = 1\n print(error)\n res = []\n if fail == 0:\n res = cur.fetchall()\n i = i + 1\n if fail == 1:\n # print(\"SQL Execution Fatal!!\", sql)\n return 0, ''\n elif fail == 0:\n return 1, res\n\n\ndef all_sql_files():\n res_path = \"{}/tpch-queries/\".format(\n os.path.dirname(os.path.abspath(__file__)))\n # all_file_list = list(filter(file_filter, os.listdir(res_path)))\n # all_file_list = sorted(all_file_list, key=custom_sort)\n all_file_list = [\n '4.explain.sql']\n\n print(all_file_list)\n files_list = []\n for file in all_file_list:\n files_list.append(res_path + file)\n return files_list\n\n\ndef custom_sort(item):\n # \u63d0\u53d6\u6570\u5b57\u548c\u5b57\u6bcd\u90e8\u5206\n match = re.match(r'(\\d+)(\\D+)', item)\n # \u5c06\u6570\u5b57\u90e8\u5206\u8f6c\u6362\u4e3a\u6574\u6570\u4ee5\u8fdb\u884c\u6bd4\u8f83\n num_part = int(match.group(1))\n # \u8fd4\u56de\u5143\u7ec4\u4ee5\u6309\u6570\u5b57\u548c\u5b57\u6bcd\u6392\u5e8f\n return (num_part, match.group(2))\n\n\ndef file_filter(f):\n if f[-4:] == '.sql' and 'schema' not in f and 'fkindexes' not in f:\n return True\n else:\n return False\n\n\ndef get_sql_from_file(file_name):\n file = open(file_name)\n lines = file.readlines().copy()\n sql = ''\n for line in lines:\n sql += line\n sql = sql.replace('\n', ' ').replace(' ', ' ').replace(' ', ' ')\n file.close()\n return sql\n\n\ndef test_hint_from_file(sql_file):\n db = Database()\n sql = get_sql_from_file(sql_file)\n success, result_cont = db.execute_sql(sql)\n print(success, result_cont)\n\n\ndef test_all():\n sql_files = all_sql_files()\n\n for sql_file in list(sql_files):\n if sql_file:\n test_hint_from_file(sql_file)\n\n\ndef test_one():\n res_path = \"{}/tpch-queries/\".format(\n os.path.dirname(os.path.abspath(__file__)))\n test_hint_from_file(res_path + '1.explain.sql')\n\n\nif __name__ == '__main__':\n for i in range(0, REPEATCOUNT):\n TIMELOG.write(str(int(time.time()))+\";\")\n test_all()\n TIMELOG.write(str(int(time.time()))+\"\n\")\n TIMELOG.flush()\n\n TIMELOG.close()\n", + "description": "In an online shopping platform's database, retrieving a large amount of data and executing related subqueries to find the inventory for each product can lead to performance issues.\n", + "desc": "In an e-commerce platform's database, there is a table called 'ProductInventory' that stores the inventory information of various products. This table contains data for tens of thousands or even hundreds of thousands of products. 
Each product's inventory information includes details such as the product ID, current stock level, last inventory update time, supplier ID, and warehouse location. In this database, there might be queries that involve fetching large amounts of data and performing correlated subqueries. For example, a common query could be to determine the total current inventory of all products within a specific category. To execute this query, the database would first select all products of the specified category from the 'ProductDetails' table, and then perform subqueries on the 'ProductInventory' table to retrieve the inventory data for these products. When dealing with a large number of products, the performance of these correlated subqueries can become inefficient. This is because executing individual subqueries for each product in the category would take a long time. Additionally, retrieving inventory information for a large number of products might require reading a significant amount of data from the disk, which could result in I/O bottlenecks.\n" + }, + "224": { + "start_time": "1697329579", + "end_time": "1697329651", + "start_timestamp": "2023-10-15 08:26:19", + "end_timestamp": "2023-10-15 08:27:31", + "alerts": [], + "labels": [ + "highly concurrent commits or highly concurrent inserts" + ], + "command": "python anomaly_trigger/main.py --anomaly INSERT_LARGE_DATA", + "script": "import psycopg2\nimport sys\nsys.path.append('/root/DB-GPT/')\nimport time\nimport datetime\nimport random\nimport yaml\nfrom multiprocessing.pool import *\n\nclass DBArgs(object):\n\n def __init__(self, dbtype, config, dbname=None):\n self.dbtype = dbtype\n if self.dbtype == 'mysql':\n self.host = config['host']\n self.port = config['port']\n self.user = config['user']\n self.password = config['password']\n self.dbname = dbname if dbname else config['dbname']\n self.driver = 'com.mysql.jdbc.Driver'\n self.jdbc = 'jdbc:mysql://'\n else:\n self.host = config['host']\n self.port = config['port']\n self.user = config['user']\n self.password = config['password']\n self.dbname = dbname if dbname else config['dbname']\n self.driver = 'org.postgresql.Driver'\n self.jdbc = 'jdbc:postgresql://'\n\nclass Database():\n def __init__(self, args, timeout=-1):\n self.args = args\n self.conn = self.resetConn(timeout)\n\n\n # self.schema = self.compute_table_schema()\n\n def resetConn(self, timeout=-1):\n conn = psycopg2.connect(database=self.args.dbname,\n user=self.args.user,\n password=self.args.password,\n host=self.args.host,\n port=self.args.port)\n return conn\n\n def execute_sqls(self,sql):\n self.conn =self.resetConn(timeout=-1)\n cur = self.conn.cursor()\n cur.execute(sql)\n self.conn.commit()\n cur.close()\n self.conn.close()\n\n def execute_sql_duration(self, duration, sql, max_id=0, commit_interval=500):\n self.conn = self.resetConn(timeout=-1)\n cursor = self.conn.cursor()\n start = time.time()\n cnt = 0\n if duration > 0:\n while (time.time() - start) < duration:\n if max_id > 0:\n id = random.randint(1, max_id - 1)\n cursor.execute(sql + str(id) + ';')\n else:\n cursor.execute(sql)\n cnt += 1\n if cnt % commit_interval == 0:\n self.conn.commit()\n else:\n print(\"error, the duration should be larger than 0\")\n self.conn.commit()\n cursor.close()\n self.conn.close()\n return cnt\n\n def concurrent_execute_sql(self, threads, duration, sql, max_id=0, commit_interval=500):\n pool = ThreadPool(threads)\n results = [pool.apply_async(self.execute_sql_duration, (duration, sql, max_id, commit_interval)) for _ in 
range(threads)]\n pool.close()\n pool.join()\n return results\n \ndef init():\n #add the config\n config_path = \"/root/DB-GPT/config/tool_config.yaml\"\n with open(config_path, 'r') as config_file:\n config = yaml.safe_load(config_file) \n db_args =DBArgs('pgsql', config)\n return db_args\n\n#create a table\ndef create_table(table_name,colsize, ncolumns):\n db=Database(init())\n column_definitions = ', '.join(f'name{i} varchar({colsize})' for i in range(ncolumns))\n creat_sql = f'CREATE TABLE {table_name} (id int, {column_definitions}, time timestamp);'\n db.execute_sqls(creat_sql)\n\n#delete the table\ndef delete_table(table_name):\n db=Database(init())\n delete_sql=f'DROP TABLE if exists {table_name}'\n db.execute_sqls(delete_sql)\n\n#print the current time\ndef print_time():\n current_time = datetime.datetime.now()\n formatted_time = current_time.strftime(\"%Y-%m-%d %H:%M:%S\")\n print(formatted_time)\n\n\ndef insert_large_data(threads,duration,ncolumns,nrows,colsize,table_name='table1'):\n\n print_time()\n #Delete undeleted tables\n delete_table(table_name)\n #create a new table\n create_table(table_name,colsize, ncolumns)\n db=Database(init())\n #insert the data\n #insert_definitions = ', '.join(f'repeat(round(random()*999)::text,{(colsize//3)})' for i in range(ncolumns))\n insert_definitions = ', '.join(f'(SELECT substr(md5(random()::text), 1, {colsize}))' for i in range(ncolumns))\n insert_data=f'insert into {table_name} select generate_series(1,{nrows}),{insert_definitions}, now();'\n db.concurrent_execute_sql(threads,duration,insert_data,commit_interval=1)\n\n #delete the table\n delete_table(table_name)\n \n #print the end time\n print_time()\nif __name__ == \"__main__\":\n # Number of threads to use for concurrent inserts\n num_threads = 158\n \n # Duration for which to run the inserts (in seconds)\n insert_duration = None\n \n # Number of columns in the table\n num_columns = 20\n \n # Number of rows to insert\n num_rows = 99\n \n # Size of each column (in characters)\n column_size = 45\n \n # Table name\n table_name = 'table1'\n \n # Call the insert_large_data function\n insert_large_data(num_threads, insert_duration, num_columns, num_rows, column_size, table_name)\n", + "description": "In a data-intensive application, there is a need to insert a large amount of data into a database simultaneously. This specific scenario involves inserting data generated by 158 sources, with each source generating data for 20 columns and 99 rows. The size of each column is 45 characters. The purpose of this simulation is to identify any potential issues or exceptions that may arise due to the high volume of data being inserted.\n", + "desc": "In an e-commerce platform, there exists a database called 'ECommerceDB' which is responsible for storing information about various products. One of the tables in this database is 'ProductDetails', which contains detailed data about each product. This table consists of 99 rows, each representing a specific product, and has a total of 20 columns, each capable of holding up to 45 characters of information. These columns may include fields such as product ID, name, price, stock quantity, brand, category, size, color, weight, user rating, number of reviews, production date, expiration date, supplier information, country, shipping method, discount information, image link, description, and product status. The 'INSERT_LARGE_DATA' anomaly refers to a situation where 158 users attempt to simultaneously insert a large amount of data into the 'ProductDetails' table. 
This could be due to activities like updating product information, adding new products to the platform, or importing data from external sources. However, since the database is not optimized for handling such a large number of concurrent insert operations, this can lead to performance issues and anomalies in the system. The lack of appropriate buffering mechanisms, indexing, or partitioning strategies can result in slower insert speeds, increased latency, or even database locking. These anomalies can impact the overall efficiency and smooth functioning of the e-commerce platform.\n" + }, + "225": { + "start_time": "1697329711", + "end_time": "1697329783", + "start_timestamp": "2023-10-15 08:28:31", + "end_timestamp": "2023-10-15 08:29:43", + "alerts": [], + "labels": [ + "highly concurrent commits or highly concurrent inserts" + ], + "command": "python anomaly_trigger/main.py --anomaly INSERT_LARGE_DATA", + "script": "import psycopg2\nimport sys\nsys.path.append('/root/DB-GPT/')\nimport time\nimport datetime\nimport random\nimport yaml\nfrom multiprocessing.pool import *\n\nclass DBArgs(object):\n\n def __init__(self, dbtype, config, dbname=None):\n self.dbtype = dbtype\n if self.dbtype == 'mysql':\n self.host = config['host']\n self.port = config['port']\n self.user = config['user']\n self.password = config['password']\n self.dbname = dbname if dbname else config['dbname']\n self.driver = 'com.mysql.jdbc.Driver'\n self.jdbc = 'jdbc:mysql://'\n else:\n self.host = config['host']\n self.port = config['port']\n self.user = config['user']\n self.password = config['password']\n self.dbname = dbname if dbname else config['dbname']\n self.driver = 'org.postgresql.Driver'\n self.jdbc = 'jdbc:postgresql://'\n\nclass Database():\n def __init__(self, args, timeout=-1):\n self.args = args\n self.conn = self.resetConn(timeout)\n\n\n # self.schema = self.compute_table_schema()\n\n def resetConn(self, timeout=-1):\n conn = psycopg2.connect(database=self.args.dbname,\n user=self.args.user,\n password=self.args.password,\n host=self.args.host,\n port=self.args.port)\n return conn\n\n def execute_sqls(self,sql):\n self.conn =self.resetConn(timeout=-1)\n cur = self.conn.cursor()\n cur.execute(sql)\n self.conn.commit()\n cur.close()\n self.conn.close()\n\n def execute_sql_duration(self, duration, sql, max_id=0, commit_interval=500):\n self.conn = self.resetConn(timeout=-1)\n cursor = self.conn.cursor()\n start = time.time()\n cnt = 0\n if duration > 0:\n while (time.time() - start) < duration:\n if max_id > 0:\n id = random.randint(1, max_id - 1)\n cursor.execute(sql + str(id) + ';')\n else:\n cursor.execute(sql)\n cnt += 1\n if cnt % commit_interval == 0:\n self.conn.commit()\n else:\n print(\"error, the duration should be larger than 0\")\n self.conn.commit()\n cursor.close()\n self.conn.close()\n return cnt\n\n def concurrent_execute_sql(self, threads, duration, sql, max_id=0, commit_interval=500):\n pool = ThreadPool(threads)\n results = [pool.apply_async(self.execute_sql_duration, (duration, sql, max_id, commit_interval)) for _ in range(threads)]\n pool.close()\n pool.join()\n return results\n \ndef init():\n #add the config\n config_path = \"/root/DB-GPT/config/tool_config.yaml\"\n with open(config_path, 'r') as config_file:\n config = yaml.safe_load(config_file) \n db_args =DBArgs('pgsql', config)\n return db_args\n\n#create a table\ndef create_table(table_name,colsize, ncolumns):\n db=Database(init())\n column_definitions = ', '.join(f'name{i} varchar({colsize})' for i in range(ncolumns))\n creat_sql 
= f'CREATE TABLE {table_name} (id int, {column_definitions}, time timestamp);'\n db.execute_sqls(creat_sql)\n\n#delete the table\ndef delete_table(table_name):\n db=Database(init())\n delete_sql=f'DROP TABLE if exists {table_name}'\n db.execute_sqls(delete_sql)\n\n#print the current time\ndef print_time():\n current_time = datetime.datetime.now()\n formatted_time = current_time.strftime(\"%Y-%m-%d %H:%M:%S\")\n print(formatted_time)\n\n\ndef insert_large_data(threads,duration,ncolumns,nrows,colsize,table_name='table1'):\n\n print_time()\n #Delete undeleted tables\n delete_table(table_name)\n #create a new table\n create_table(table_name,colsize, ncolumns)\n db=Database(init())\n #insert the data\n #insert_definitions = ', '.join(f'repeat(round(random()*999)::text,{(colsize//3)})' for i in range(ncolumns))\n insert_definitions = ', '.join(f'(SELECT substr(md5(random()::text), 1, {colsize}))' for i in range(ncolumns))\n insert_data=f'insert into {table_name} select generate_series(1,{nrows}),{insert_definitions}, now();'\n db.concurrent_execute_sql(threads,duration,insert_data,commit_interval=1)\n\n #delete the table\n delete_table(table_name)\n \n #print the end time\n print_time()\nif __name__ == \"__main__\":\n # Number of threads to use for concurrent inserts\n num_threads = 158\n \n # Duration for which to run the inserts (in seconds)\n insert_duration = None\n \n # Number of columns in the table\n num_columns = 34\n \n # Number of rows to insert\n num_rows = 58\n \n # Size of each column (in characters)\n column_size = 51\n \n # Table name\n table_name = 'table1'\n \n # Call the insert_large_data function\n insert_large_data(num_threads, insert_duration, num_columns, num_rows, column_size, table_name)\n", + "description": "In a data logging system, there are 158 sensor devices generating a large amount of data. This data needs to be simultaneously inserted into a database table with 34 columns and 58 rows of records, where each column can hold up to 51 characters. This simulation triggers a database exception to simulate the performance impact caused by inserting a large volume of data.\n", + "desc": "In an Internet of Things (IoT) scenario, imagine a database used for collecting and analyzing sensor data, named 'SensorDataDB'. This database is designed to handle a large volume of data from various types of sensors. The primary table in the database is called 'SensorReadings', which contains various fields to store data from 58 sensors. These fields may include sensor ID, reading type (such as temperature, humidity, pressure, light, motion, etc.), reading value, timestamp, sensor location, and status information.At a particular time, 158 sensors start transmitting data simultaneously at a high frequency. Due to the lack of effective data partitioning in the 'SensorReadings' table and insufficient buffering mechanisms, the database's ability to process these numerous concurrent write requests is limited. This can lead to increased write latency in the database and may even result in database locking, which can ultimately lead to anomalies. 
Such anomalies can affect the overall performance and reliability of the sensor data collection and analysis system.\n" + }, + "226": { + "start_time": "1697329843", + "end_time": "1697329903", + "start_timestamp": "2023-10-15 08:30:43", + "end_timestamp": "2023-10-15 08:31:43", + "alerts": [], + "labels": [ + "highly concurrent updates" + ], + "command": "python anomaly_trigger/main.py --anomaly LOCK_CONTENTION", + "script": "import psycopg2\nimport sys\nsys.path.append('/root/DB-GPT/')\nimport time\nimport datetime\nimport random\nimport yaml\nfrom multiprocessing.pool import *\n\n\nclass DBArgs(object):\n\n def __init__(self, dbtype, config, dbname=None):\n self.dbtype = dbtype\n if self.dbtype == 'mysql':\n self.host = config['host']\n self.port = config['port']\n self.user = config['user']\n self.password = config['password']\n self.dbname = dbname if dbname else config['dbname']\n self.driver = 'com.mysql.jdbc.Driver'\n self.jdbc = 'jdbc:mysql://'\n else:\n self.host = config['host']\n self.port = config['port']\n self.user = config['user']\n self.password = config['password']\n self.dbname = dbname if dbname else config['dbname']\n self.driver = 'org.postgresql.Driver'\n self.jdbc = 'jdbc:postgresql://'\n\nclass Database():\n def __init__(self, args, timeout=-1):\n self.args = args\n self.conn = self.resetConn(timeout)\n\n\n # self.schema = self.compute_table_schema()\n\n def resetConn(self, timeout=-1):\n conn = psycopg2.connect(database=self.args.dbname,\n user=self.args.user,\n password=self.args.password,\n host=self.args.host,\n port=self.args.port)\n return conn\n \n def execute_sqls(self,sql):\n self.conn =self.resetConn(timeout=-1)\n cur = self.conn.cursor()\n cur.execute(sql)\n self.conn.commit()\n cur.close()\n self.conn.close()\n\n def execute_sql_duration(self, duration, sql, max_id=0, commit_interval=500):\n self.conn = self.resetConn(timeout=-1)\n cursor = self.conn.cursor()\n start = time.time()\n cnt = 0\n if duration > 0:\n while (time.time() - start) < duration:\n if max_id > 0:\n id = random.randint(1, max_id - 1)\n cursor.execute(sql + str(id) + ';')\n else:\n cursor.execute(sql)\n cnt += 1\n if cnt % commit_interval == 0:\n self.conn.commit()\n else:\n print(\"error, the duration should be larger than 0\")\n self.conn.commit()\n cursor.close()\n self.conn.close()\n return cnt\n\n def concurrent_execute_sql(self, threads, duration, sql, max_id=0, commit_interval=500):\n pool = ThreadPool(threads)\n results = [pool.apply_async(self.execute_sql_duration, (duration, sql, max_id, commit_interval)) for _ in range(threads)]\n pool.close()\n pool.join()\n return results\n\ndef init():\n #add the config\n config_path = \"/root/DB-GPT/config/tool_config.yaml\"\n with open(config_path, 'r') as config_file:\n config = yaml.safe_load(config_file) \n db_args =DBArgs('pgsql', config)\n return db_args\n\n\n#create a table\ndef create_table(table_name,colsize, ncolumns):\n db=Database(init())\n column_definitions = ', '.join(f'name{i} varchar({colsize})' for i in range(ncolumns))\n creat_sql = f'CREATE TABLE {table_name} (id int, {column_definitions}, time timestamp);'\n db.execute_sqls(creat_sql)\n\n#delete the table\ndef delete_table(table_name):\n db=Database(init())\n delete_sql=f'DROP TABLE if exists {table_name}'\n db.execute_sqls(delete_sql)\n\n#print the current time\ndef print_time():\n current_time = datetime.datetime.now()\n formatted_time = current_time.strftime(\"%Y-%m-%d %H:%M:%S\")\n print(formatted_time)\n\n\ndef 
lock_contention(threads,duration,ncolumns,nrows,colsize,table_name='table1'):\n #create a new table\n print_time()\n delete_table(table_name)\n create_table(table_name,colsize, ncolumns)\n db=Database(init())\n # insert some data to be updated \n insert_definitions = ', '.join(f'(SELECT substr(md5(random()::text), 1, {colsize}))' for i in range(ncolumns))\n insert_data=f'insert into {table_name} select generate_series(1,{nrows}),{insert_definitions}, now();' \n db.execute_sqls(insert_data) \n pool = Pool(threads)\n for _ in range(threads):\n pool.apply_async(\n lock, (table_name, ncolumns, colsize, duration, nrows))\n pool.close()\n pool.join()\n #delete the table\n delete_table(table_name)\n print_time()\n\ndef lock(table_name, ncolumns, colsize, duration, nrows):\n args=init()\n start = time.time()\n #lock_contention\n while time.time()-start < duration:\n conn = psycopg2.connect(database=args.dbname, user=args.user, password=args.password,\n host=args.host, port=args.port)\n cur = conn.cursor()\n while time.time()-start < duration:\n col_name = random.randint(0, ncolumns-1)\n row_name = random.randint(1, nrows-1)\n lock_contention = f'update {table_name} set name{col_name}=(SELECT substr(md5(random()::text), 1, {colsize})) where id ={row_name}'\n #db.concurrent_execute_sql(threads,duration,lock_contention,nrows)\n cur.execute(lock_contention)\n conn.commit()\n conn.commit()\n conn.close()\n\nif __name__ == \"__main__\":\n # Number of threads to use for concurrent inserts\n num_threads = 62\n \n # Duration for which to run the inserts (in seconds)\n insert_duration = None\n \n # Number of columns in the table\n num_columns = 92\n \n # Number of rows to insert\n num_rows = 320\n \n # Size of each column (in characters)\n column_size = 82\n \n # Table name\n table_name = 'table1'\n \n # Call the insert_large_data function\n lock_contention(num_threads, insert_duration, num_columns, num_rows, column_size, table_name)\n", + "description": "In a database used by an online platform, there is a situation where 62 users try to update the database simultaneously. The database table has 92 columns, with each column having a size of 82 characters, and there are 320 rows of data. Due to the high number of users competing for database table locks, a database exception is simulated.\n", + "desc": "In the database of an online store, suppose there is an e-commerce database named 'OnlineStoreDB'. The database stores a key table called 'ProductInventory', which contains information about the inventory of various products. In this table, there are 320 rows, each representing an individual product, with a total of 92 columns. These columns include product ID, stock quantity, last inventory update time, supplier ID, warehouse location, and other details, each column being able to hold up to 82 characters.In a specific scenario, there are 62 users simultaneously accessing the database to retrieve inventory information. However, due to the database's design and locking mechanism, when these users try to access the same or adjacent rows in the 'ProductInventory' table, a competition for locking the database table occurs. This fight for locking the table lasts for an unknown duration of time.The presence of this contention, where multiple users are vying for access to the database simultaneously, can lead to performance issues. 
During this contention, users might experience delays or failures in retrieving inventory information, impacting the overall user experience of the online store.Furthermore, if such contention incidents occur frequently, it can lead to substantial growth in the database's transaction log, potentially consuming excessive storage space and even causing temporary disruptions in the database services. Overall, the occurrence of lock contention can significantly affect the smooth functioning of the online store and disrupt its operations.\n" + }, + "227": { + "start_time": "1697329963", + "end_time": "1697330005", + "start_timestamp": "2023-10-15 08:32:43", + "end_timestamp": "2023-10-15 08:33:25", + "alerts": [], + "labels": [ + "highly deletes" + ], + "command": "python anomaly_trigger/main.py --anomaly VACUUM", + "script": "import psycopg2\nimport sys\nsys.path.append('/root/DB-GPT/')\nimport time\nimport datetime\nimport random\nimport yaml\nfrom multiprocessing.pool import *\n\n\nclass DBArgs(object):\n\n def __init__(self, dbtype, config, dbname=None):\n self.dbtype = dbtype\n if self.dbtype == 'mysql':\n self.host = config['host']\n self.port = config['port']\n self.user = config['user']\n self.password = config['password']\n self.dbname = dbname if dbname else config['dbname']\n self.driver = 'com.mysql.jdbc.Driver'\n self.jdbc = 'jdbc:mysql://'\n else:\n self.host = config['host']\n self.port = config['port']\n self.user = config['user']\n self.password = config['password']\n self.dbname = dbname if dbname else config['dbname']\n self.driver = 'org.postgresql.Driver'\n self.jdbc = 'jdbc:postgresql://'\n\nclass Database():\n def __init__(self, args, timeout=-1):\n self.args = args\n self.conn = self.resetConn(timeout)\n\n\n # self.schema = self.compute_table_schema()\n\n def resetConn(self, timeout=-1):\n conn = psycopg2.connect(database=self.args.dbname,\n user=self.args.user,\n password=self.args.password,\n host=self.args.host,\n port=self.args.port)\n return conn\n \n def execute_sqls(self,sql):\n self.conn =self.resetConn(timeout=-1)\n cur = self.conn.cursor()\n cur.execute(sql)\n self.conn.commit()\n cur.close()\n self.conn.close()\n\n def execute_sql_duration(self, duration, sql, max_id=0, commit_interval=500):\n self.conn = self.resetConn(timeout=-1)\n cursor = self.conn.cursor()\n start = time.time()\n cnt = 0\n if duration > 0:\n while (time.time() - start) < duration:\n if max_id > 0:\n id = random.randint(1, max_id - 1)\n cursor.execute(sql + str(id) + ';')\n else:\n cursor.execute(sql)\n cnt += 1\n if cnt % commit_interval == 0:\n self.conn.commit()\n else:\n print(\"error, the duration should be larger than 0\")\n self.conn.commit()\n cursor.close()\n self.conn.close()\n return cnt\n\n def concurrent_execute_sql(self, threads, duration, sql, max_id=0, commit_interval=500):\n pool = ThreadPool(threads)\n results = [pool.apply_async(self.execute_sql_duration, (duration, sql, max_id, commit_interval)) for _ in range(threads)]\n pool.close()\n pool.join()\n return results\n \ndef init():\n #add the config\n config_path = \"/root/DB-GPT/config/tool_config.yaml\"\n with open(config_path, 'r') as config_file:\n config = yaml.safe_load(config_file) \n db_args =DBArgs('pgsql', config)\n return db_args\n\n\n#create a table\ndef create_table(table_name,colsize, ncolumns):\n db=Database(init())\n column_definitions = ', '.join(f'name{i} varchar({colsize})' for i in range(ncolumns))\n creat_sql = f'CREATE TABLE {table_name} (id int, {column_definitions}, time timestamp);'\n 
db.execute_sqls(creat_sql)\n\n#delete the table\ndef delete_table(table_name):\n db=Database(init())\n delete_sql=f'DROP TABLE if exists {table_name}'\n db.execute_sqls(delete_sql)\n\n#print the current time\ndef print_time():\n current_time = datetime.datetime.now()\n formatted_time = current_time.strftime(\"%Y-%m-%d %H:%M:%S\")\n print(formatted_time)\n\n\ndef vacuum(threads,duration,ncolumns,nrows,colsize,table_name='table1'):\n db=Database(init())\n #create a new table\n print_time()\n delete_table(table_name)\n create_table(table_name,colsize, ncolumns)\n\n # insert some data to be deleted\n insert_definitions = ', '.join(f'(SELECT substr(md5(random()::text), 1, {colsize}))' for i in range(ncolumns))\n insert_data=f'insert into {table_name} select generate_series(1,{nrows}),{insert_definitions}, now();' \n db.execute_sqls(insert_data) \n\n # delete 80% of the rows\n delete_nrows=int(nrows*0.8)\n vacuum=f'delete from {table_name} where id < {delete_nrows};'\n db.execute_sqls(vacuum)\n\n # do the select , then the vacuum occurs\n select='select * from '+table_name+' where id='\n db.concurrent_execute_sql(threads,duration,select,nrows)\n\n #delete the table\n delete_table(table_name)\n print_time()\n\n\nif __name__ == \"__main__\":\n # Number of threads to use for concurrent inserts\n num_threads = 101\n \n # Duration for which to run the inserts (in seconds)\n insert_duration = None\n \n # Number of columns in the table\n num_columns = 20\n \n # Number of rows to insert\n num_rows = 3551817\n \n # Size of each column (in characters)\n column_size = 51\n \n # Table name\n table_name = 'table1'\n \n # Call the insert_large_data function\n vacuum(num_threads, insert_duration, num_columns, num_rows, column_size, table_name)\n", + "description": "In the database of an online store, there are 101 users searching for products using terms such as product name, category, and price range. These users are searching in a database table containing 20 columns and 3,551,817 rows of product records, each with a column size of 51 characters. This search operation occurs after a large-scale data cleaning operation on the database table. The aim is to simulate the database exception caused by this process.\n", + "desc": "In an online store scenario, the database named 'OnlineStoreDB' stores information about various products. It has a key table called 'ProductRecords', which contains detailed information about the products, such as their IDs, names, prices, stock quantities, descriptions, brands, categories, and more. The table consists of approximately 3,551,817 rows of data, with each row representing a separate product. The table has a total of 20 columns, and each column can store up to 51 characters of information. The 'VACUUM' anomaly is triggered in this scenario. The 'VACUUM' operation refers to a process in which the database administrator performs a cleanup to reclaim unused space in the database. In this specific case, the administrator uses the 'VACUUM' operation to optimize the storage and performance of the 'ProductRecords' table. 
However, due to the large number of threads (101) and the size of the table, this 'VACUUM' operation might take a long time and could potentially impact the database's performance during its execution.\n" + }, + "228": { + "start_time": "1697330065", + "end_time": "1697330179", + "start_timestamp": "2023-10-15 08:34:25", + "end_timestamp": "2023-10-15 08:36:19", + "alerts": [], + "labels": [ + "too many indexes" + ], + "command": "python anomaly_trigger/main.py --anomaly REDUNDANT_INDEX", + "script": "import psycopg2\nimport sys\nsys.path.append('/root/DB-GPT/')\nimport time\nimport datetime\nimport random\nimport yaml\nfrom multiprocessing.pool import *\n\n\nclass DBArgs(object):\n\n def __init__(self, dbtype, config, dbname=None):\n self.dbtype = dbtype\n if self.dbtype == 'mysql':\n self.host = config['host']\n self.port = config['port']\n self.user = config['user']\n self.password = config['password']\n self.dbname = dbname if dbname else config['dbname']\n self.driver = 'com.mysql.jdbc.Driver'\n self.jdbc = 'jdbc:mysql://'\n else:\n self.host = config['host']\n self.port = config['port']\n self.user = config['user']\n self.password = config['password']\n self.dbname = dbname if dbname else config['dbname']\n self.driver = 'org.postgresql.Driver'\n self.jdbc = 'jdbc:postgresql://'\n\nclass Database():\n def __init__(self, args, timeout=-1):\n self.args = args\n self.conn = self.resetConn(timeout)\n\n\n # self.schema = self.compute_table_schema()\n\n def resetConn(self, timeout=-1):\n conn = psycopg2.connect(database=self.args.dbname,\n user=self.args.user,\n password=self.args.password,\n host=self.args.host,\n port=self.args.port)\n return conn\n \n def execute_sqls(self,sql):\n self.conn =self.resetConn(timeout=-1)\n cur = self.conn.cursor()\n cur.execute(sql)\n self.conn.commit()\n cur.close()\n self.conn.close()\n\n def execute_sql_duration(self, duration, sql, max_id=0, commit_interval=500):\n self.conn = self.resetConn(timeout=-1)\n cursor = self.conn.cursor()\n start = time.time()\n cnt = 0\n if duration > 0:\n while (time.time() - start) < duration:\n if max_id > 0:\n id = random.randint(1, max_id - 1)\n cursor.execute(sql + str(id) + ';')\n else:\n cursor.execute(sql)\n cnt += 1\n if cnt % commit_interval == 0:\n self.conn.commit()\n else:\n print(\"error, the duration should be larger than 0\")\n self.conn.commit()\n cursor.close()\n self.conn.close()\n return cnt\n\n def concurrent_execute_sql(self, threads, duration, sql, max_id=0, commit_interval=500):\n pool = ThreadPool(threads)\n results = [pool.apply_async(self.execute_sql_duration, (duration, sql, max_id, commit_interval)) for _ in range(threads)]\n pool.close()\n pool.join()\n return results\n \n def build_index(self, table_name, idx_num):\n self.conn = self.resetConn(timeout=-1)\n cursor = self.conn.cursor()\n \n for i in range(0, idx_num):\n the_sql = 'CREATE INDEX index_' + table_name + '_' + str(i) + ' ON ' + table_name + '(name' + str(i) + ');'\n print(the_sql)\n cursor.execute(the_sql)\n\n \n self.conn.commit()\n self.conn.close()\n return\n\n\n \n def drop_index(self,table_name):\n self.conn = self.resetConn(timeout=-1)\n cursor = self.conn.cursor()\n cursor.execute(\"select indexname from pg_indexes where tablename='\"+table_name+\"';\")\n idxs = cursor.fetchall()\n for idx in idxs:\n the_sql = 'DROP INDEX ' + idx[0] + ';'\n cursor.execute(the_sql)\n print(the_sql)\n self.conn.commit()\n self.conn.close()\n return\n\n\ndef init():\n #add the config\n config_path = \"/root/DB-GPT/config/tool_config.yaml\"\n 
with open(config_path, 'r') as config_file:\n config = yaml.safe_load(config_file) \n db_args =DBArgs('pgsql', config)\n return db_args\n\n\n#create a table\ndef create_table(table_name,colsize, ncolumns):\n db=Database(init())\n column_definitions = ', '.join(f'name{i} varchar({colsize})' for i in range(ncolumns))\n creat_sql = f'CREATE TABLE {table_name} (id int, {column_definitions}, time timestamp);'\n db.execute_sqls(creat_sql)\n\n#delete the table\ndef delete_table(table_name):\n db=Database(init())\n delete_sql=f'DROP TABLE if exists {table_name}'\n db.execute_sqls(delete_sql)\n\n#print the current time\ndef print_time():\n current_time = datetime.datetime.now()\n formatted_time = current_time.strftime(\"%Y-%m-%d %H:%M:%S\")\n print(formatted_time)\n\ndef redundent_index(threads,duration,ncolumns,nrows,colsize,nindex,table_name='table1'):\n #create a new table\n print_time()\n delete_table(table_name)\n create_table(table_name,colsize, ncolumns)\n db=Database(init())\n # insert some data to be updated \n insert_definitions = ', '.join(f'(SELECT substr(md5(random()::text), 1, {colsize}))' for i in range(ncolumns))\n insert_data=f'insert into {table_name} select generate_series(1,{nrows}),{insert_definitions}, now();' \n db.execute_sqls(insert_data) \n\n #initialization of the indexes\n nindex=int((nindex*ncolumns)/10)\n db.build_index(table_name,nindex)\n id_index='CREATE INDEX index_'+table_name+'_id ON '+table_name+'(id);'\n db.execute_sqls(id_index)\n\n #lock_contention\n pool = Pool(threads)\n for _ in range(threads):\n pool.apply_async(\n lock, (table_name, ncolumns, colsize, duration, nrows))\n pool.close()\n pool.join()\n\n #drop the index\n db.drop_index(table_name)\n\n #delete the table\n delete_table(table_name)\n print_time()\n\n\ndef lock(table_name, ncolumns, colsize, duration, nrows):\n args=init()\n start = time.time()\n #lock_contention\n while time.time()-start < duration:\n conn = psycopg2.connect(database=args.dbname, user=args.user, password=args.password,\n host=args.host, port=args.port)\n cur = conn.cursor()\n while time.time()-start < duration:\n col_name = random.randint(0, ncolumns-1)\n row_name = random.randint(1, nrows-1)\n lock_contention = f'update {table_name} set name{col_name}=(SELECT substr(md5(random()::text), 1, {colsize})) where id ={row_name}'\n #db.concurrent_execute_sql(threads,duration,lock_contention,nrows)\n cur.execute(lock_contention)\n conn.commit()\n conn.commit()\n conn.close()\n\nif __name__ == \"__main__\":\n # Number of threads to use for concurrent inserts\n num_threads = 5\n \n # Duration for which to run the inserts (in seconds)\n insert_duration = None\n \n # Number of columns in the table\n num_columns = 77\n \n # Number of rows to insert\n num_rows = 698055\n \n # Size of each column (in characters)\n column_size = 78\n \n # Table name\n table_name = 'table1'\n \n nindex=6\n \n # Call the insert_large_data function\n redundent_index(num_threads, insert_duration, num_columns, num_rows, column_size, nindex,table_name)\n", + "description": "In an e-commerce database, when conducting queries with redundant indexes on a database table containing 77 columns, 698,055 rows, each column having a size of 78 characters, and with 5 users performing the queries simultaneously, there may be additional storage requirements and performance overhead due to the redundant indexes.\n", + "desc": "In the business intelligence scenario, we can envision a database named 'BusinessIntelligenceDB' used for storing and analyzing various business data. 
This database contains multiple tables, one of which is a key table named 'SalesData', recording detailed information about sales transactions. This table contains 698,055 rows of data, each row representing a sales transaction record, with a total of 77 columns, each containing information up to 78 characters. These columns may include transaction ID, customer ID, product ID, sales amount, sales date and time, region, salesperson information, payment method, discount information, and more. In a typical business intelligence analysis process, the database administrator might create multiple indexes to accelerate query performance. These indexes could be based on customer ID, product ID, sales amount, or other frequently used columns. However, if these indexes are redundant and unnecessary, they might increase storage space, impact insert and update performance, and slow down overall query execution.Now, suppose at a specific moment, five users simultaneously execute complex sales analysis queries on the 'SalesData' table in the 'BusinessIntelligenceDB'. To enhance query performance, the administrator might create multiple indexes before the queries start and then delete them after the queries are completed. However, if there are redundant indexes created and dropped frequently, this can result in additional storage usage, performance overhead, and database fragmentation. In a business intelligence environment, this could lead to delayed report generation, impacting the efficiency of the decision-making process.\n" + }, + "229": { + "start_time": "1697330239", + "end_time": "1697330330", + "start_timestamp": "2023-10-15 08:37:19", + "end_timestamp": "2023-10-15 08:38:50", + "alerts": [], + "labels": [ + "INSERT_LARGE_DATA", + "IO_CONTENTION" + ], + "command": "python anomaly_trigger/main.py --anomaly INSERT_LARGE_DATA,IO_CONTENTION", + "script": "import os\nimport datetime\n\n#print the current time\ndef print_time():\n current_time = datetime.datetime.now()\n formatted_time = current_time.strftime(\"%Y-%m-%d %H:%M:%S\")\n print(formatted_time)\n\nif __name__ == \"__main__\":\n print_time()\n command = (\n \"su - root -c 'cd /sysbench-tpcc-master; \"\n \"./tpcc.lua --db-driver=pgsql --tables=2 --scale=3 --threads=50 --events=0 \"\n \"--pgsql-host=xxxx --pgsql-user=xxxx --pgsql-password=xxxx \"\n \"--pgsql-port=5432 --pgsql-db=tpcc --time=90 --rand-type=uniform --report-interval=10 run'\"\n )\n\n os.system(command)\n print_time()\n", + "description": "In a system where multiple users are sharing files, there is a high volume of data being uploaded, downloaded, or edited simultaneously. This causes contention for input/output (I/O) resources, resulting in slower file transfers.\n", + "desc": "In a file sharing system, imagine a database called 'TeamFileShareDB' that is used by teams or organizations to share files. This database stores both the files themselves and the metadata related to the files, such as uploader information, file size, creation and modification dates, version history, access permissions, and download counts. In this scenario, multiple users upload, download, and edit files at the same time. For example, a project team collaborates on an important report, with team members frequently uploading the latest versions of files while others download them for viewing or editing. Additionally, the system is used to store and share large files, such as presentations, video conference recordings, or design drawings. 
Due to the high concurrency of file operations, the 'TeamFileShareDB' database faces challenges related to input/output (I/O) contention. When multiple users simultaneously upload or download large files, the storage and network bandwidth of the system may be significantly strained. This contention can result in slower file transfer speeds, especially in cases of limited bandwidth or insufficient server processing capabilities. Furthermore, frequent write operations in the database, such as file uploads and metadata updates, can impact database performance. During peak periods, the database may experience locking and transaction management issues, further slowing down file processing and metadata recording.\n" + }, + "230": { + "start_time": "1697330390", + "end_time": "1697330451", + "start_timestamp": "2023-10-15 08:39:50", + "end_timestamp": "2023-10-15 08:40:51", + "alerts": [], + "labels": [ + "POOR JOIN PERFORMANCE", + "CPU CONTENTION" + ], + "command": "python anomaly_trigger/main.py --anomaly POOR_JOIN_PERFORMANCE,CPU_CONTENTION", + "script": "import os\nimport re\nimport time\n\nimport psycopg2\n\n\nREPEATCOUNT = 1\nTIMELOGPATH = str(int(time.time())) + \"_job_trigger_time_log.txt\"\nTIMELOG = open(TIMELOGPATH, 'w+')\n\n\nclass Database():\n\n def __init__(self):\n self.conn = None\n self.conn = psycopg2.connect(database='imdbload',\n user='xxxx',\n password='xxxx',\n host='xxxx',\n port=5432)\n\n def execute_sql(self, sql):\n fail = 1\n cur = self.conn.cursor()\n i = 0\n cnt = 3\n while fail == 1 and i < cnt:\n try:\n fail = 0\n cur.execute(sql)\n except BaseException as error:\n fail = 1\n print(error)\n res = []\n if fail == 0:\n res = cur.fetchall()\n i = i + 1\n if fail == 1:\n # print(\"SQL Execution Fatal!!\", sql)\n return 0, ''\n elif fail == 0:\n return 1, res\n\n\ndef all_sql_files():\n res_path = \"{}/join-order-benchmark-master/\".format(\n os.path.dirname(os.path.abspath(__file__)))\n # all_file_list = list(filter(file_filter, os.listdir(res_path)))\n # all_file_list = sorted(all_file_list, key=custom_sort)\n all_file_list = [\n '1a.sql', '1b.sql', '1c.sql', '1d.sql',\n '2a.sql', '2b.sql', '2c.sql', '2d.sql',\n '3a.sql', '3b.sql', '3c.sql',\n '4a.sql', '4b.sql', '4c.sql',\n '5a.sql', '5b.sql', '5c.sql',\n '6a.sql', '6b.sql', '6c.sql', '6d.sql', '6e.sql', '6f.sql',\n '7a.sql', '7b.sql', '7c.sql',\n '8a.sql', '8b.sql', '8c.sql', '8d.sql',\n '9a.sql', '9b.sql', '9c.sql', '9d.sql',\n '10a.sql', '10b.sql', '10c.sql',\n '11a.sql', '11b.sql', '11c.sql', '11d.sql',\n '12a.sql', '12b.sql', '12c.sql',\n '13a.sql', '13b.sql', '13c.sql', '13d.sql',\n '14a.sql', '14b.sql', '14c.sql',\n '15a.sql', '15b.sql', '15c.sql', '15d.sql',\n '16a.sql', '16b.sql', '16c.sql', '16d.sql',\n '17a.sql', '17b.sql', '17c.sql', '17d.sql', '17e.sql', '17f.sql',\n '18a.sql', '18b.sql', '18c.sql',\n '19a.sql', '19b.sql', '19c.sql', '19d.sql',\n '20a.sql', '20b.sql', '20c.sql',\n '21a.sql', '21b.sql', '21c.sql',\n '22a.sql', '22b.sql', '22c.sql', '22d.sql',\n '23a.sql', '23b.sql', '23c.sql',\n '24a.sql', '24b.sql',\n '25a.sql', '25b.sql', '25c.sql',\n '26a.sql', '26b.sql', '26c.sql',\n '27a.sql', '27b.sql', '27c.sql',\n '28a.sql', '28b.sql', '28c.sql',\n '29a.sql', '29b.sql', '29c.sql',\n '30a.sql', '30b.sql', '30c.sql',\n '31a.sql', '31b.sql', '31c.sql',\n '32a.sql', '32b.sql',\n '33a.sql', '33b.sql', '33c.sql']\n\n print(all_file_list)\n files_list = []\n for file in all_file_list:\n files_list.append(res_path + file)\n return files_list\n\n\ndef custom_sort(item):\n # 
\u63d0\u53d6\u6570\u5b57\u548c\u5b57\u6bcd\u90e8\u5206\n match = re.match(r'(\\d+)(\\D+)', item)\n # \u5c06\u6570\u5b57\u90e8\u5206\u8f6c\u6362\u4e3a\u6574\u6570\u4ee5\u8fdb\u884c\u6bd4\u8f83\n num_part = int(match.group(1))\n # \u8fd4\u56de\u5143\u7ec4\u4ee5\u6309\u6570\u5b57\u548c\u5b57\u6bcd\u6392\u5e8f\n return (num_part, match.group(2))\n\n\ndef file_filter(f):\n if f[-4:] == '.sql' and 'schema' not in f and 'fkindexes' not in f:\n return True\n else:\n return False\n\n\ndef get_sql_from_file(file_name):\n file = open(file_name)\n lines = file.readlines().copy()\n sql = ''\n for line in lines:\n sql += line\n sql = sql.replace('\n', ' ').replace(' ', ' ').replace(' ', ' ')\n file.close()\n return sql\n\n\ndef test_hint_from_file(sql_file):\n db = Database()\n sql = get_sql_from_file(sql_file)\n success, result_cont = db.execute_sql(sql)\n print(success, result_cont)\n\n\ndef test_all():\n sql_files = all_sql_files()\n\n for sql_file in list(sql_files)[:-10]:\n if sql_file:\n test_hint_from_file(sql_file)\n\n\ndef test_one():\n res_path = \"{}/join-order-benchmark-master/\".format(\n os.path.dirname(os.path.abspath(__file__)))\n test_hint_from_file(res_path + '1a.sql')\n\n\nif __name__ == '__main__':\n for i in range(0, REPEATCOUNT):\n TIMELOG.write(str(int(time.time()))+\";\")\n test_all()\n TIMELOG.write(str(int(time.time()))+\"\n\")\n TIMELOG.flush()\n\n TIMELOG.close()\n\n)\n", + "description": "In a database for a logistics management system, there is a poor join performance issue when retrieving data related to shipments and routes. Additionally, CPU contention occurs when multiple users simultaneously perform calculations and data processing tasks, causing the system to slow down.\n", + "desc": "In a business intelligence scenario, suppose there is a database named 'CorporateAnalyticsDB', which is used for storing and analyzing various business data of a large corporation. This database contains multiple complex tables, such as 'SalesData', 'CustomerProfiles', 'ProductCatalog', and 'MarketTrends', each filled with a large number of rows and columns, recording detailed business information. In this scenario, the company's analysts frequently need to perform complex join queries across multiple tables to generate comprehensive business reports. For example, they might need to join the 'SalesData' table (containing sales records) with the 'CustomerProfiles' table (containing customer information) to analyze the purchasing behaviors of different customer groups. At the same time, they might also need to link these data with 'ProductCatalog' (the product catalog) and 'MarketTrends' (market trends) to gain deeper market insights. Due to the large size of these tables and the involvement of multi-table joins, the performance of these queries can become very slow. If the join keys in these tables are not effectively indexed or if the queries are not properly optimized, executing these join operations can consume a significant amount of time and resources. During peak periods, when multiple complex join queries are executed simultaneously, this can lead to competition for CPU resources, further reducing query efficiency. 
The CPU contention might occur due to too many compute-intensive queries running on the database server or the server's CPU resources being insufficient to handle these queries.\n" + }, + "231": { + "start_time": "1697330511", + "end_time": "1697330659", + "start_timestamp": "2023-10-15 08:41:51", + "end_timestamp": "2023-10-15 08:44:19", + "alerts": [], + "labels": [ + "FETCH_LARGE_DATA", + "CORRELATED SUBQUERY" + ], + "command": "python anomaly_trigger/main.py --anomaly FETCH_LARGE_DATA,CORRELATED_SUBQUERY", + "script": "import os\nimport re\nimport time\n\nimport psycopg2\n\n\nREPEATCOUNT = 1\nTIMELOGPATH = str(int(time.time())) + \"_tpch_trigger_time_log.txt\"\nTIMELOG = open(TIMELOGPATH, 'w+')\n\n\nclass Database():\n\n def __init__(self):\n self.conn = None\n self.conn = psycopg2.connect(database='tpch',\n user='xxxx',\n password='xxxx',\n host='xxxx',\n port=5432)\n\n def execute_sql(self, sql):\n fail = 1\n cur = self.conn.cursor()\n i = 0\n cnt = 3\n while fail == 1 and i < cnt:\n try:\n fail = 0\n cur.execute(sql)\n except BaseException as error:\n fail = 1\n print(error)\n res = []\n if fail == 0:\n res = cur.fetchall()\n i = i + 1\n if fail == 1:\n # print(\"SQL Execution Fatal!!\", sql)\n return 0, ''\n elif fail == 0:\n return 1, res\n\n\ndef all_sql_files():\n res_path = \"{}/tpch-queries/\".format(\n os.path.dirname(os.path.abspath(__file__)))\n # all_file_list = list(filter(file_filter, os.listdir(res_path)))\n # all_file_list = sorted(all_file_list, key=custom_sort)\n all_file_list = [\n '4.explain.sql']\n\n print(all_file_list)\n files_list = []\n for file in all_file_list:\n files_list.append(res_path + file)\n return files_list\n\n\ndef custom_sort(item):\n # \u63d0\u53d6\u6570\u5b57\u548c\u5b57\u6bcd\u90e8\u5206\n match = re.match(r'(\\d+)(\\D+)', item)\n # \u5c06\u6570\u5b57\u90e8\u5206\u8f6c\u6362\u4e3a\u6574\u6570\u4ee5\u8fdb\u884c\u6bd4\u8f83\n num_part = int(match.group(1))\n # \u8fd4\u56de\u5143\u7ec4\u4ee5\u6309\u6570\u5b57\u548c\u5b57\u6bcd\u6392\u5e8f\n return (num_part, match.group(2))\n\n\ndef file_filter(f):\n if f[-4:] == '.sql' and 'schema' not in f and 'fkindexes' not in f:\n return True\n else:\n return False\n\n\ndef get_sql_from_file(file_name):\n file = open(file_name)\n lines = file.readlines().copy()\n sql = ''\n for line in lines:\n sql += line\n sql = sql.replace('\n', ' ').replace(' ', ' ').replace(' ', ' ')\n file.close()\n return sql\n\n\ndef test_hint_from_file(sql_file):\n db = Database()\n sql = get_sql_from_file(sql_file)\n success, result_cont = db.execute_sql(sql)\n print(success, result_cont)\n\n\ndef test_all():\n sql_files = all_sql_files()\n\n for sql_file in list(sql_files):\n if sql_file:\n test_hint_from_file(sql_file)\n\n\ndef test_one():\n res_path = \"{}/tpch-queries/\".format(\n os.path.dirname(os.path.abspath(__file__)))\n test_hint_from_file(res_path + '1.explain.sql')\n\n\nif __name__ == '__main__':\n for i in range(0, REPEATCOUNT):\n TIMELOG.write(str(int(time.time()))+\";\")\n test_all()\n TIMELOG.write(str(int(time.time()))+\"\n\")\n TIMELOG.flush()\n\n TIMELOG.close()\n", + "description": "In the database of an e-commerce platform, there is a scenario where a large amount of data needs to be fetched, and the query involves executing correlated subqueries. 
This can lead to a performance degradation when retrieving inventory information for each product if the correlated subqueries are not optimized.\n", + "desc": "In an e-commerce scenario, there is a database called 'ProductDatabase' that stores information about various products. One of the key tables in this database is called 'ProductInventory', which contains data about the inventory levels of different products. This table includes columns such as product ID, stock quantity, last update time, supplier ID, and warehouse location. When retrieving the inventory level of specific products, the database might need to perform correlated subqueries, which involve selecting products from another table and then querying the inventory data for those products in the 'ProductInventory' table. However, when there is a large number of products involved in these queries, the performance of these correlated subqueries can become inefficient. This is because retrieving the inventory information for each product requires reading a significant amount of data from the disk, leading to potential I/O bottlenecks.\n" + }, + "232": { + "start_time": "1697330719", + "end_time": "1697330791", + "start_timestamp": "2023-10-15 08:45:19", + "end_timestamp": "2023-10-15 08:46:31", + "alerts": [], + "labels": [ + "highly concurrent commits or highly concurrent inserts" + ], + "command": "python anomaly_trigger/main.py --anomaly INSERT_LARGE_DATA", + "script": "import psycopg2\nimport sys\nsys.path.append('/root/DB-GPT/')\nimport time\nimport datetime\nimport random\nimport yaml\nfrom multiprocessing.pool import *\n\nclass DBArgs(object):\n\n def __init__(self, dbtype, config, dbname=None):\n self.dbtype = dbtype\n if self.dbtype == 'mysql':\n self.host = config['host']\n self.port = config['port']\n self.user = config['user']\n self.password = config['password']\n self.dbname = dbname if dbname else config['dbname']\n self.driver = 'com.mysql.jdbc.Driver'\n self.jdbc = 'jdbc:mysql://'\n else:\n self.host = config['host']\n self.port = config['port']\n self.user = config['user']\n self.password = config['password']\n self.dbname = dbname if dbname else config['dbname']\n self.driver = 'org.postgresql.Driver'\n self.jdbc = 'jdbc:postgresql://'\n\nclass Database():\n def __init__(self, args, timeout=-1):\n self.args = args\n self.conn = self.resetConn(timeout)\n\n\n # self.schema = self.compute_table_schema()\n\n def resetConn(self, timeout=-1):\n conn = psycopg2.connect(database=self.args.dbname,\n user=self.args.user,\n password=self.args.password,\n host=self.args.host,\n port=self.args.port)\n return conn\n\n def execute_sqls(self,sql):\n self.conn =self.resetConn(timeout=-1)\n cur = self.conn.cursor()\n cur.execute(sql)\n self.conn.commit()\n cur.close()\n self.conn.close()\n\n def execute_sql_duration(self, duration, sql, max_id=0, commit_interval=500):\n self.conn = self.resetConn(timeout=-1)\n cursor = self.conn.cursor()\n start = time.time()\n cnt = 0\n if duration > 0:\n while (time.time() - start) < duration:\n if max_id > 0:\n id = random.randint(1, max_id - 1)\n cursor.execute(sql + str(id) + ';')\n else:\n cursor.execute(sql)\n cnt += 1\n if cnt % commit_interval == 0:\n self.conn.commit()\n else:\n print(\"error, the duration should be larger than 0\")\n self.conn.commit()\n cursor.close()\n self.conn.close()\n return cnt\n\n def concurrent_execute_sql(self, threads, duration, sql, max_id=0, commit_interval=500):\n pool = ThreadPool(threads)\n results = [pool.apply_async(self.execute_sql_duration, 
(duration, sql, max_id, commit_interval)) for _ in range(threads)]\n pool.close()\n pool.join()\n return results\n \ndef init():\n #add the config\n config_path = \"/root/DB-GPT/config/tool_config.yaml\"\n with open(config_path, 'r') as config_file:\n config = yaml.safe_load(config_file) \n db_args =DBArgs('pgsql', config)\n return db_args\n\n#create a table\ndef create_table(table_name,colsize, ncolumns):\n db=Database(init())\n column_definitions = ', '.join(f'name{i} varchar({colsize})' for i in range(ncolumns))\n creat_sql = f'CREATE TABLE {table_name} (id int, {column_definitions}, time timestamp);'\n db.execute_sqls(creat_sql)\n\n#delete the table\ndef delete_table(table_name):\n db=Database(init())\n delete_sql=f'DROP TABLE if exists {table_name}'\n db.execute_sqls(delete_sql)\n\n#print the current time\ndef print_time():\n current_time = datetime.datetime.now()\n formatted_time = current_time.strftime(\"%Y-%m-%d %H:%M:%S\")\n print(formatted_time)\n\n\ndef insert_large_data(threads,duration,ncolumns,nrows,colsize,table_name='table1'):\n\n print_time()\n #Delete undeleted tables\n delete_table(table_name)\n #create a new table\n create_table(table_name,colsize, ncolumns)\n db=Database(init())\n #insert the data\n #insert_definitions = ', '.join(f'repeat(round(random()*999)::text,{(colsize//3)})' for i in range(ncolumns))\n insert_definitions = ', '.join(f'(SELECT substr(md5(random()::text), 1, {colsize}))' for i in range(ncolumns))\n insert_data=f'insert into {table_name} select generate_series(1,{nrows}),{insert_definitions}, now();'\n db.concurrent_execute_sql(threads,duration,insert_data,commit_interval=1)\n\n #delete the table\n delete_table(table_name)\n \n #print the end time\n print_time()\nif __name__ == \"__main__\":\n # Number of threads to use for concurrent inserts\n num_threads = 191\n \n # Duration for which to run the inserts (in seconds)\n insert_duration = None\n \n # Number of columns in the table\n num_columns = 19\n \n # Number of rows to insert\n num_rows = 58\n \n # Size of each column (in characters)\n column_size = 67\n \n # Table name\n table_name = 'table1'\n \n # Call the insert_large_data function\n insert_large_data(num_threads, insert_duration, num_columns, num_rows, column_size, table_name)\n", + "description": "In a smart home system, 191 devices, such as sensors and actuators, simultaneously generate a large amount of data that needs to be inserted into the central database. Each device has 19 data points, with each data point containing 67 characters. The total number of data points is 58. However, as a result of this simultaneous data insertion process, the performance of the database may be affected and an exception may occur.\n", + "desc": "In an Internet of Things (IoT) scenario, let's consider a database named 'SensorDataDB' that stores data collected from various sensors. This database is designed to handle a large volume of data. One of the tables in this database is called 'SensorReadings', which records readings from different sensors. The table consists of 58 rows, each representing a sensor reading, with a total of 19 columns, each containing data of up to 67 characters. These columns might include sensor ID, reading type, reading value, timestamp, sensor location, and status information. In this particular situation, the database is experiencing an anomaly related to inserting large amounts of data. When 191 threads simultaneously attempt to insert data into the 'SensorReadings' table, the database's performance becomes degraded. 
This could be due to a lack of appropriate data partitioning, insufficient buffering mechanisms, or the absence of proper indexing in the table. As a result, the database struggles to handle such a large influx of simultaneous insert operations. This anomaly can lead to increased write latency, potential database locking, and overall performance issues. It is crucial to address these issues to ensure smooth and efficient data insertion in the IoT environment.\n" + }, + "233": { + "start_time": "1697330852", + "end_time": "1697330924", + "start_timestamp": "2023-10-15 08:47:32", + "end_timestamp": "2023-10-15 08:48:44", + "alerts": [], + "labels": [ + "highly concurrent commits or highly concurrent inserts" + ], + "command": "python anomaly_trigger/main.py --anomaly INSERT_LARGE_DATA", + "script": "import psycopg2\nimport sys\nsys.path.append('/root/DB-GPT/')\nimport time\nimport datetime\nimport random\nimport yaml\nfrom multiprocessing.pool import *\n\nclass DBArgs(object):\n\n def __init__(self, dbtype, config, dbname=None):\n self.dbtype = dbtype\n if self.dbtype == 'mysql':\n self.host = config['host']\n self.port = config['port']\n self.user = config['user']\n self.password = config['password']\n self.dbname = dbname if dbname else config['dbname']\n self.driver = 'com.mysql.jdbc.Driver'\n self.jdbc = 'jdbc:mysql://'\n else:\n self.host = config['host']\n self.port = config['port']\n self.user = config['user']\n self.password = config['password']\n self.dbname = dbname if dbname else config['dbname']\n self.driver = 'org.postgresql.Driver'\n self.jdbc = 'jdbc:postgresql://'\n\nclass Database():\n def __init__(self, args, timeout=-1):\n self.args = args\n self.conn = self.resetConn(timeout)\n\n\n # self.schema = self.compute_table_schema()\n\n def resetConn(self, timeout=-1):\n conn = psycopg2.connect(database=self.args.dbname,\n user=self.args.user,\n password=self.args.password,\n host=self.args.host,\n port=self.args.port)\n return conn\n\n def execute_sqls(self,sql):\n self.conn =self.resetConn(timeout=-1)\n cur = self.conn.cursor()\n cur.execute(sql)\n self.conn.commit()\n cur.close()\n self.conn.close()\n\n def execute_sql_duration(self, duration, sql, max_id=0, commit_interval=500):\n self.conn = self.resetConn(timeout=-1)\n cursor = self.conn.cursor()\n start = time.time()\n cnt = 0\n if duration > 0:\n while (time.time() - start) < duration:\n if max_id > 0:\n id = random.randint(1, max_id - 1)\n cursor.execute(sql + str(id) + ';')\n else:\n cursor.execute(sql)\n cnt += 1\n if cnt % commit_interval == 0:\n self.conn.commit()\n else:\n print(\"error, the duration should be larger than 0\")\n self.conn.commit()\n cursor.close()\n self.conn.close()\n return cnt\n\n def concurrent_execute_sql(self, threads, duration, sql, max_id=0, commit_interval=500):\n pool = ThreadPool(threads)\n results = [pool.apply_async(self.execute_sql_duration, (duration, sql, max_id, commit_interval)) for _ in range(threads)]\n pool.close()\n pool.join()\n return results\n \ndef init():\n #add the config\n config_path = \"/root/DB-GPT/config/tool_config.yaml\"\n with open(config_path, 'r') as config_file:\n config = yaml.safe_load(config_file) \n db_args =DBArgs('pgsql', config)\n return db_args\n\n#create a table\ndef create_table(table_name,colsize, ncolumns):\n db=Database(init())\n column_definitions = ', '.join(f'name{i} varchar({colsize})' for i in range(ncolumns))\n creat_sql = f'CREATE TABLE {table_name} (id int, {column_definitions}, time timestamp);'\n db.execute_sqls(creat_sql)\n\n#delete the 
table\ndef delete_table(table_name):\n db=Database(init())\n delete_sql=f'DROP TABLE if exists {table_name}'\n db.execute_sqls(delete_sql)\n\n#print the current time\ndef print_time():\n current_time = datetime.datetime.now()\n formatted_time = current_time.strftime(\"%Y-%m-%d %H:%M:%S\")\n print(formatted_time)\n\n\ndef insert_large_data(threads,duration,ncolumns,nrows,colsize,table_name='table1'):\n\n print_time()\n #Delete undeleted tables\n delete_table(table_name)\n #create a new table\n create_table(table_name,colsize, ncolumns)\n db=Database(init())\n #insert the data\n #insert_definitions = ', '.join(f'repeat(round(random()*999)::text,{(colsize//3)})' for i in range(ncolumns))\n insert_definitions = ', '.join(f'(SELECT substr(md5(random()::text), 1, {colsize}))' for i in range(ncolumns))\n insert_data=f'insert into {table_name} select generate_series(1,{nrows}),{insert_definitions}, now();'\n db.concurrent_execute_sql(threads,duration,insert_data,commit_interval=1)\n\n #delete the table\n delete_table(table_name)\n \n #print the end time\n print_time()\nif __name__ == \"__main__\":\n # Number of threads to use for concurrent inserts\n num_threads = 191\n \n # Duration for which to run the inserts (in seconds)\n insert_duration = None\n \n # Number of columns in the table\n num_columns = 21\n \n # Number of rows to insert\n num_rows = 99\n \n # Size of each column (in characters)\n column_size = 92\n \n # Table name\n table_name = 'table1'\n \n # Call the insert_large_data function\n insert_large_data(num_threads, insert_duration, num_columns, num_rows, column_size, table_name)\n", + "description": "In an IoT application, a high number of sensors generate a large amount of data that needs to be inserted into the database simultaneously. This script simulates the scenario by inserting data from 191 sensors into a database table with 21 columns and 99 rows. Each column has a size of 92 characters. This may cause a database exception due to the simultaneous insertion of large amounts of data.\n", + "desc": "In an e-commerce database scenario, there is a database called 'ECommerceDB' that stores information about various products. Within this database, there is a table called 'ProductDetails' which contains 99 rows of data, each representing a product. This table has 21 columns, including product ID, name, price, stock quantity, brand, category, size, color, weight, user rating, number of reviews, production date, expiration date, supplier information, country, shipping method, discount information, image link, description, and product status. Each column can store up to 92 characters of information. Suppose there is a situation where 191 users simultaneously attempt to insert large amounts of data into the 'ProductDetails' table. This can cause performance issues in the database, particularly if appropriate optimization measures such as batch processing, partitioning, or high-speed data transfer mechanisms are not in place. Without these optimizations, the database may experience delays in processing the large number of insert requests, leading to anomalies. 
Such anomalies could impact user experience, the operational efficiency of the e-commerce platform, and potentially affect its reputation.\n" + }, + "234": { + "start_time": "1697330984", + "end_time": "1697331044", + "start_timestamp": "2023-10-15 08:49:44", + "end_timestamp": "2023-10-15 08:50:44", + "alerts": [], + "labels": [ + "highly concurrent updates" + ], + "command": "python anomaly_trigger/main.py --anomaly LOCK_CONTENTION", + "script": "import psycopg2\nimport sys\nsys.path.append('/root/DB-GPT/')\nimport time\nimport datetime\nimport random\nimport yaml\nfrom multiprocessing.pool import *\n\n\nclass DBArgs(object):\n\n def __init__(self, dbtype, config, dbname=None):\n self.dbtype = dbtype\n if self.dbtype == 'mysql':\n self.host = config['host']\n self.port = config['port']\n self.user = config['user']\n self.password = config['password']\n self.dbname = dbname if dbname else config['dbname']\n self.driver = 'com.mysql.jdbc.Driver'\n self.jdbc = 'jdbc:mysql://'\n else:\n self.host = config['host']\n self.port = config['port']\n self.user = config['user']\n self.password = config['password']\n self.dbname = dbname if dbname else config['dbname']\n self.driver = 'org.postgresql.Driver'\n self.jdbc = 'jdbc:postgresql://'\n\nclass Database():\n def __init__(self, args, timeout=-1):\n self.args = args\n self.conn = self.resetConn(timeout)\n\n\n # self.schema = self.compute_table_schema()\n\n def resetConn(self, timeout=-1):\n conn = psycopg2.connect(database=self.args.dbname,\n user=self.args.user,\n password=self.args.password,\n host=self.args.host,\n port=self.args.port)\n return conn\n \n def execute_sqls(self,sql):\n self.conn =self.resetConn(timeout=-1)\n cur = self.conn.cursor()\n cur.execute(sql)\n self.conn.commit()\n cur.close()\n self.conn.close()\n\n def execute_sql_duration(self, duration, sql, max_id=0, commit_interval=500):\n self.conn = self.resetConn(timeout=-1)\n cursor = self.conn.cursor()\n start = time.time()\n cnt = 0\n if duration > 0:\n while (time.time() - start) < duration:\n if max_id > 0:\n id = random.randint(1, max_id - 1)\n cursor.execute(sql + str(id) + ';')\n else:\n cursor.execute(sql)\n cnt += 1\n if cnt % commit_interval == 0:\n self.conn.commit()\n else:\n print(\"error, the duration should be larger than 0\")\n self.conn.commit()\n cursor.close()\n self.conn.close()\n return cnt\n\n def concurrent_execute_sql(self, threads, duration, sql, max_id=0, commit_interval=500):\n pool = ThreadPool(threads)\n results = [pool.apply_async(self.execute_sql_duration, (duration, sql, max_id, commit_interval)) for _ in range(threads)]\n pool.close()\n pool.join()\n return results\n\ndef init():\n #add the config\n config_path = \"/root/DB-GPT/config/tool_config.yaml\"\n with open(config_path, 'r') as config_file:\n config = yaml.safe_load(config_file) \n db_args =DBArgs('pgsql', config)\n return db_args\n\n\n#create a table\ndef create_table(table_name,colsize, ncolumns):\n db=Database(init())\n column_definitions = ', '.join(f'name{i} varchar({colsize})' for i in range(ncolumns))\n creat_sql = f'CREATE TABLE {table_name} (id int, {column_definitions}, time timestamp);'\n db.execute_sqls(creat_sql)\n\n#delete the table\ndef delete_table(table_name):\n db=Database(init())\n delete_sql=f'DROP TABLE if exists {table_name}'\n db.execute_sqls(delete_sql)\n\n#print the current time\ndef print_time():\n current_time = datetime.datetime.now()\n formatted_time = current_time.strftime(\"%Y-%m-%d %H:%M:%S\")\n print(formatted_time)\n\n\ndef 
lock_contention(threads,duration,ncolumns,nrows,colsize,table_name='table1'):\n #create a new table\n print_time()\n delete_table(table_name)\n create_table(table_name,colsize, ncolumns)\n db=Database(init())\n # insert some data to be updated \n insert_definitions = ', '.join(f'(SELECT substr(md5(random()::text), 1, {colsize}))' for i in range(ncolumns))\n insert_data=f'insert into {table_name} select generate_series(1,{nrows}),{insert_definitions}, now();' \n db.execute_sqls(insert_data) \n pool = Pool(threads)\n for _ in range(threads):\n pool.apply_async(\n lock, (table_name, ncolumns, colsize, duration, nrows))\n pool.close()\n pool.join()\n #delete the table\n delete_table(table_name)\n print_time()\n\ndef lock(table_name, ncolumns, colsize, duration, nrows):\n args=init()\n start = time.time()\n #lock_contention\n while time.time()-start < duration:\n conn = psycopg2.connect(database=args.dbname, user=args.user, password=args.password,\n host=args.host, port=args.port)\n cur = conn.cursor()\n while time.time()-start < duration:\n col_name = random.randint(0, ncolumns-1)\n row_name = random.randint(1, nrows-1)\n lock_contention = f'update {table_name} set name{col_name}=(SELECT substr(md5(random()::text), 1, {colsize})) where id ={row_name}'\n #db.concurrent_execute_sql(threads,duration,lock_contention,nrows)\n cur.execute(lock_contention)\n conn.commit()\n conn.commit()\n conn.close()\n\nif __name__ == \"__main__\":\n # Number of threads to use for concurrent inserts\n num_threads = 117\n \n # Duration for which to run the inserts (in seconds)\n insert_duration = None\n \n # Number of columns in the table\n num_columns = 95\n \n # Number of rows to insert\n num_rows = 209\n \n # Size of each column (in characters)\n column_size = 82\n \n # Table name\n table_name = 'table1'\n \n # Call the insert_large_data function\n lock_contention(num_threads, insert_duration, num_columns, num_rows, column_size, table_name)\n", + "description": "In an online database system, 117 users simultaneously attempt to perform updates on a database table containing 95 columns and 209 rows of records. Each column has a size of 82 characters. During the update operations, the users compete with each other to lock the database table, causing contention and potentially resulting in a database exception.\n", + "desc": "In an IoT scenario, suppose there is a database named 'IoTDataDB', which is used to store and analyze data from various IoT devices. This database contains a key table named 'SensorReadings', which records detailed information about sensor readings from 209 devices. Each device generates data at a high frequency, and this data is stored in the 'SensorReadings' table. The table consists of 209 rows of data, each row representing a reading from a specific sensor, with a total of 95 columns, each containing information of up to 82 characters. These columns may include sensor ID, reading type (such as temperature, humidity, pressure, light, motion, etc.), reading value, timestamp, location, and device status. In this scenario, due to the high number of devices and the frequency at which they generate data, there might be a contention issue in the database related to locking. When multiple devices try to access and write to the 'SensorReadings' table simultaneously, there could be competition and contention for locks on the table. This contention can lead to delays in processing data and can impact the overall efficiency and performance of the database. 
In extreme cases, it could even result in data loss or incorrect readings being recorded. Managing and resolving this locking contention is crucial to ensuring smooth and efficient operations in the IoT data processing pipeline.\n" + }, + "235": { + "start_time": "1697331105", + "end_time": "1697331145", + "start_timestamp": "2023-10-15 08:51:45", + "end_timestamp": "2023-10-15 08:52:25", + "alerts": [], + "labels": [ + "highly deletes" + ], + "command": "python anomaly_trigger/main.py --anomaly VACUUM", + "script": "import psycopg2\nimport sys\nsys.path.append('/root/DB-GPT/')\nimport time\nimport datetime\nimport random\nimport yaml\nfrom multiprocessing.pool import *\n\n\nclass DBArgs(object):\n\n def __init__(self, dbtype, config, dbname=None):\n self.dbtype = dbtype\n if self.dbtype == 'mysql':\n self.host = config['host']\n self.port = config['port']\n self.user = config['user']\n self.password = config['password']\n self.dbname = dbname if dbname else config['dbname']\n self.driver = 'com.mysql.jdbc.Driver'\n self.jdbc = 'jdbc:mysql://'\n else:\n self.host = config['host']\n self.port = config['port']\n self.user = config['user']\n self.password = config['password']\n self.dbname = dbname if dbname else config['dbname']\n self.driver = 'org.postgresql.Driver'\n self.jdbc = 'jdbc:postgresql://'\n\nclass Database():\n def __init__(self, args, timeout=-1):\n self.args = args\n self.conn = self.resetConn(timeout)\n\n\n # self.schema = self.compute_table_schema()\n\n def resetConn(self, timeout=-1):\n conn = psycopg2.connect(database=self.args.dbname,\n user=self.args.user,\n password=self.args.password,\n host=self.args.host,\n port=self.args.port)\n return conn\n \n def execute_sqls(self,sql):\n self.conn =self.resetConn(timeout=-1)\n cur = self.conn.cursor()\n cur.execute(sql)\n self.conn.commit()\n cur.close()\n self.conn.close()\n\n def execute_sql_duration(self, duration, sql, max_id=0, commit_interval=500):\n self.conn = self.resetConn(timeout=-1)\n cursor = self.conn.cursor()\n start = time.time()\n cnt = 0\n if duration > 0:\n while (time.time() - start) < duration:\n if max_id > 0:\n id = random.randint(1, max_id - 1)\n cursor.execute(sql + str(id) + ';')\n else:\n cursor.execute(sql)\n cnt += 1\n if cnt % commit_interval == 0:\n self.conn.commit()\n else:\n print(\"error, the duration should be larger than 0\")\n self.conn.commit()\n cursor.close()\n self.conn.close()\n return cnt\n\n def concurrent_execute_sql(self, threads, duration, sql, max_id=0, commit_interval=500):\n pool = ThreadPool(threads)\n results = [pool.apply_async(self.execute_sql_duration, (duration, sql, max_id, commit_interval)) for _ in range(threads)]\n pool.close()\n pool.join()\n return results\n \ndef init():\n #add the config\n config_path = \"/root/DB-GPT/config/tool_config.yaml\"\n with open(config_path, 'r') as config_file:\n config = yaml.safe_load(config_file) \n db_args =DBArgs('pgsql', config)\n return db_args\n\n\n#create a table\ndef create_table(table_name,colsize, ncolumns):\n db=Database(init())\n column_definitions = ', '.join(f'name{i} varchar({colsize})' for i in range(ncolumns))\n creat_sql = f'CREATE TABLE {table_name} (id int, {column_definitions}, time timestamp);'\n db.execute_sqls(creat_sql)\n\n#delete the table\ndef delete_table(table_name):\n db=Database(init())\n delete_sql=f'DROP TABLE if exists {table_name}'\n db.execute_sqls(delete_sql)\n\n#print the current time\ndef print_time():\n current_time = datetime.datetime.now()\n formatted_time = current_time.strftime(\"%Y-%m-%d 
%H:%M:%S\")\n print(formatted_time)\n\n\ndef vacuum(threads,duration,ncolumns,nrows,colsize,table_name='table1'):\n db=Database(init())\n #create a new table\n print_time()\n delete_table(table_name)\n create_table(table_name,colsize, ncolumns)\n\n # insert some data to be deleted\n insert_definitions = ', '.join(f'(SELECT substr(md5(random()::text), 1, {colsize}))' for i in range(ncolumns))\n insert_data=f'insert into {table_name} select generate_series(1,{nrows}),{insert_definitions}, now();' \n db.execute_sqls(insert_data) \n\n # delete 80% of the rows\n delete_nrows=int(nrows*0.8)\n vacuum=f'delete from {table_name} where id < {delete_nrows};'\n db.execute_sqls(vacuum)\n\n # do the select , then the vacuum occurs\n select='select * from '+table_name+' where id='\n db.concurrent_execute_sql(threads,duration,select,nrows)\n\n #delete the table\n delete_table(table_name)\n print_time()\n\n\nif __name__ == \"__main__\":\n # Number of threads to use for concurrent inserts\n num_threads = 199\n \n # Duration for which to run the inserts (in seconds)\n insert_duration = None\n \n # Number of columns in the table\n num_columns = 19\n \n # Number of rows to insert\n num_rows = 3090193\n \n # Size of each column (in characters)\n column_size = 92\n \n # Table name\n table_name = 'table1'\n \n # Call the insert_large_data function\n vacuum(num_threads, insert_duration, num_columns, num_rows, column_size, table_name)\n", + "description": "In the database of an online store, there is a need to perform a vacuum operation to reclaim space and optimize the performance of the database. This operation is triggered by simulating 199 users searching in a database table containing 19 columns, 3,090,193 rows, where each column has a size of 92 characters. The purpose is to mimic the potential exception that could occur when a large-scale data cleaning operation coincides with a search by multiple users.\n", + "desc": "In an e-commerce database, there is a table named 'ProductDetails' that stores information about various products. This table consists of 3,090,193 rows, with each row representing a specific product. It includes 19 columns, each capable of holding up to 92 characters, recording details such as product ID, name, price, stock quantity, description, brand, category, size, color, weight, production and expiration dates, supplier information, country, rating, number of reviews, sales status, promotional information, image links, and other attributes. During a particular process, 199 concurrent threads are used to perform a VACUUM operation, which is a large-scale data cleanup operation. This operation involves deleting unnecessary or outdated product records from the database to maintain data integrity and optimize database performance. However, without proper optimization measures or performing the operation during periods of low traffic, such a large-scale deletion process can negatively impact database performance and potentially lead to anomalies. 
This could result in delayed or failed queries, affect other database operations, and potentially hinder the overall functionality and efficiency of the e-commerce platform.\n" + }, + "236": { + "start_time": "1697331205", + "end_time": "1697331319", + "start_timestamp": "2023-10-15 08:53:25", + "end_timestamp": "2023-10-15 08:55:19", + "alerts": [], + "labels": [ + "too many indexes" + ], + "command": "python anomaly_trigger/main.py --anomaly REDUNDANT_INDEX", + "script": "import psycopg2\nimport sys\nsys.path.append('/root/DB-GPT/')\nimport time\nimport datetime\nimport random\nimport yaml\nfrom multiprocessing.pool import *\n\n\nclass DBArgs(object):\n\n def __init__(self, dbtype, config, dbname=None):\n self.dbtype = dbtype\n if self.dbtype == 'mysql':\n self.host = config['host']\n self.port = config['port']\n self.user = config['user']\n self.password = config['password']\n self.dbname = dbname if dbname else config['dbname']\n self.driver = 'com.mysql.jdbc.Driver'\n self.jdbc = 'jdbc:mysql://'\n else:\n self.host = config['host']\n self.port = config['port']\n self.user = config['user']\n self.password = config['password']\n self.dbname = dbname if dbname else config['dbname']\n self.driver = 'org.postgresql.Driver'\n self.jdbc = 'jdbc:postgresql://'\n\nclass Database():\n def __init__(self, args, timeout=-1):\n self.args = args\n self.conn = self.resetConn(timeout)\n\n\n # self.schema = self.compute_table_schema()\n\n def resetConn(self, timeout=-1):\n conn = psycopg2.connect(database=self.args.dbname,\n user=self.args.user,\n password=self.args.password,\n host=self.args.host,\n port=self.args.port)\n return conn\n \n def execute_sqls(self,sql):\n self.conn =self.resetConn(timeout=-1)\n cur = self.conn.cursor()\n cur.execute(sql)\n self.conn.commit()\n cur.close()\n self.conn.close()\n\n def execute_sql_duration(self, duration, sql, max_id=0, commit_interval=500):\n self.conn = self.resetConn(timeout=-1)\n cursor = self.conn.cursor()\n start = time.time()\n cnt = 0\n if duration > 0:\n while (time.time() - start) < duration:\n if max_id > 0:\n id = random.randint(1, max_id - 1)\n cursor.execute(sql + str(id) + ';')\n else:\n cursor.execute(sql)\n cnt += 1\n if cnt % commit_interval == 0:\n self.conn.commit()\n else:\n print(\"error, the duration should be larger than 0\")\n self.conn.commit()\n cursor.close()\n self.conn.close()\n return cnt\n\n def concurrent_execute_sql(self, threads, duration, sql, max_id=0, commit_interval=500):\n pool = ThreadPool(threads)\n results = [pool.apply_async(self.execute_sql_duration, (duration, sql, max_id, commit_interval)) for _ in range(threads)]\n pool.close()\n pool.join()\n return results\n \n def build_index(self, table_name, idx_num):\n self.conn = self.resetConn(timeout=-1)\n cursor = self.conn.cursor()\n \n for i in range(0, idx_num):\n the_sql = 'CREATE INDEX index_' + table_name + '_' + str(i) + ' ON ' + table_name + '(name' + str(i) + ');'\n print(the_sql)\n cursor.execute(the_sql)\n\n \n self.conn.commit()\n self.conn.close()\n return\n\n\n \n def drop_index(self,table_name):\n self.conn = self.resetConn(timeout=-1)\n cursor = self.conn.cursor()\n cursor.execute(\"select indexname from pg_indexes where tablename='\"+table_name+\"';\")\n idxs = cursor.fetchall()\n for idx in idxs:\n the_sql = 'DROP INDEX ' + idx[0] + ';'\n cursor.execute(the_sql)\n print(the_sql)\n self.conn.commit()\n self.conn.close()\n return\n\n\ndef init():\n #add the config\n config_path = \"/root/DB-GPT/config/tool_config.yaml\"\n with open(config_path, 'r') as 
config_file:\n config = yaml.safe_load(config_file) \n db_args =DBArgs('pgsql', config)\n return db_args\n\n\n#create a table\ndef create_table(table_name,colsize, ncolumns):\n db=Database(init())\n column_definitions = ', '.join(f'name{i} varchar({colsize})' for i in range(ncolumns))\n creat_sql = f'CREATE TABLE {table_name} (id int, {column_definitions}, time timestamp);'\n db.execute_sqls(creat_sql)\n\n#delete the table\ndef delete_table(table_name):\n db=Database(init())\n delete_sql=f'DROP TABLE if exists {table_name}'\n db.execute_sqls(delete_sql)\n\n#print the current time\ndef print_time():\n current_time = datetime.datetime.now()\n formatted_time = current_time.strftime(\"%Y-%m-%d %H:%M:%S\")\n print(formatted_time)\n\ndef redundent_index(threads,duration,ncolumns,nrows,colsize,nindex,table_name='table1'):\n #create a new table\n print_time()\n delete_table(table_name)\n create_table(table_name,colsize, ncolumns)\n db=Database(init())\n # insert some data to be updated \n insert_definitions = ', '.join(f'(SELECT substr(md5(random()::text), 1, {colsize}))' for i in range(ncolumns))\n insert_data=f'insert into {table_name} select generate_series(1,{nrows}),{insert_definitions}, now();' \n db.execute_sqls(insert_data) \n\n #initialization of the indexes\n nindex=int((nindex*ncolumns)/10)\n db.build_index(table_name,nindex)\n id_index='CREATE INDEX index_'+table_name+'_id ON '+table_name+'(id);'\n db.execute_sqls(id_index)\n\n #lock_contention\n pool = Pool(threads)\n for _ in range(threads):\n pool.apply_async(\n lock, (table_name, ncolumns, colsize, duration, nrows))\n pool.close()\n pool.join()\n\n #drop the index\n db.drop_index(table_name)\n\n #delete the table\n delete_table(table_name)\n print_time()\n\n\ndef lock(table_name, ncolumns, colsize, duration, nrows):\n args=init()\n start = time.time()\n #lock_contention\n while time.time()-start < duration:\n conn = psycopg2.connect(database=args.dbname, user=args.user, password=args.password,\n host=args.host, port=args.port)\n cur = conn.cursor()\n while time.time()-start < duration:\n col_name = random.randint(0, ncolumns-1)\n row_name = random.randint(1, nrows-1)\n lock_contention = f'update {table_name} set name{col_name}=(SELECT substr(md5(random()::text), 1, {colsize})) where id ={row_name}'\n #db.concurrent_execute_sql(threads,duration,lock_contention,nrows)\n cur.execute(lock_contention)\n conn.commit()\n conn.commit()\n conn.close()\n\nif __name__ == \"__main__\":\n # Number of threads to use for concurrent inserts\n num_threads = 5\n \n # Duration for which to run the inserts (in seconds)\n insert_duration = None\n \n # Number of columns in the table\n num_columns = 68\n \n # Number of rows to insert\n num_rows = 958228\n \n # Size of each column (in characters)\n column_size = 75\n \n # Table name\n table_name = 'table1'\n \n nindex=6\n \n # Call the insert_large_data function\n redundent_index(num_threads, insert_duration, num_columns, num_rows, column_size, nindex,table_name)\n", + "description": "In an e-commerce database with 68 columns and 958,228 rows, each with a column size of 75 characters, redundant indexes are created for various attributes such as product name, category, and price range before a query operation. 
Simulate the performance overhead and additional storage caused by these redundant indexes, with 5 users performing the query operation simultaneously.\n", + "desc": "In an Internet of Things (IoT) scenario, suppose there is a database named 'SensorDataDB', which is used to collect and analyze data from various sensors. This database is designed to handle a large volume of sensor data and contains multiple tables, one of which is a key table named 'SensorReadings'. This table stores readings from different sensors, including information such as sensor ID, reading type, value, timestamp, and location. Suppose this table has 958,228 rows of data, with each row representing a sensor reading. The table consists of 68 columns, each containing information up to 75 characters. In this scenario, it is possible that multiple sensors are generating data simultaneously, and this data is being written to the database. To optimize query performance and enhance data analysis capabilities, redundant indexes might be created on certain columns of the 'SensorReadings' table. These indexes could be based on factors such as sensor ID, reading type, or timestamp. The intention behind creating these redundant indexes is to accelerate queries that involve filtering or aggregating the sensor data. However, if too many redundant indexes are created or if they are not properly managed, they can lead to several issues.Firstly, the creation of redundant indexes consumes additional storage space, which can impact the overall storage capacity of the database. Secondly, maintaining these indexes during data insertion or deletion operations can result in increased write latency and slower database performance. Moreover, frequent index updates can lead to index fragmentation, further degrading query performance. Lastly, if the indexes are not properly utilized in query execution plans, they can introduce unnecessary overhead and degrade query performance instead of improving it.In an IoT environment, where sensor data is constantly being generated and analyzed in real-time, the presence of redundant indexes can lead to suboptimal database performance. It is crucial to carefully manage and optimize these indexes to ensure efficient and effective data analysis while avoiding unnecessary overhead.\n" + }, + "237": { + "start_time": "1697331379", + "end_time": "1697331470", + "start_timestamp": "2023-10-15 08:56:19", + "end_timestamp": "2023-10-15 08:57:50", + "alerts": [], + "labels": [ + "INSERT_LARGE_DATA", + "IO_CONTENTION" + ], + "command": "python anomaly_trigger/main.py --anomaly INSERT_LARGE_DATA,IO_CONTENTION", + "script": "import os\nimport datetime\n\n#print the current time\ndef print_time():\n current_time = datetime.datetime.now()\n formatted_time = current_time.strftime(\"%Y-%m-%d %H:%M:%S\")\n print(formatted_time)\n\nif __name__ == \"__main__\":\n print_time()\n command = (\n \"su - root -c 'cd /sysbench-tpcc-master; \"\n \"./tpcc.lua --db-driver=pgsql --tables=2 --scale=3 --threads=50 --events=0 \"\n \"--pgsql-host=xxxx --pgsql-user=xxxx --pgsql-password=xxxx \"\n \"--pgsql-port=5432 --pgsql-db=tpcc --time=90 --rand-type=uniform --report-interval=10 run'\"\n )\n\n os.system(command)\n print_time()\n", + "description": "In a collaborative document editing system, multiple users are simultaneously uploading, downloading, or editing documents. 
This creates a competition for input/output resources, leading to slower file transfer speeds.\n", + "desc": "In an Internet of Things (IoT) scenario, imagine a system used for monitoring and collecting data from various sensors placed in a smart city. This system includes a database named 'SensorDataDB' that stores and processes the sensor data. The database contains a table named 'SensorReadings' which records the data received from multiple sensors. Each row in this table represents a reading from a specific sensor and includes information such as sensor ID, sensor type, reading value, timestamp, and location. On a regular basis, the sensors transmit a large volume of data to the 'SensorDataDB' database. However, due to the high frequency at which the sensors transmit data and the large volume of data being written to the database, the system may experience Input/Output (I/O) contention. This I/O contention occurs when multiple sensors attempt to write data to the database simultaneously, leading to competition for storage resources and network bandwidth. As a result, the file transfer speeds within the system may become slower, and the database's performance may be impacted. To mitigate this issue, proper optimization techniques should be implemented, such as optimizing the handling of write operations, enhancing the storage infrastructure, or improving the network bandwidth capacity. By doing so, the system can effectively manage the large influx of sensor data and maintain efficient data transfer and processing capabilities.\n" + }, + "238": { + "start_time": "1697331530", + "end_time": "1697331590", + "start_timestamp": "2023-10-15 08:58:50", + "end_timestamp": "2023-10-15 08:59:50", + "alerts": [], + "labels": [ + "POOR JOIN PERFORMANCE", + "CPU CONTENTION" + ], + "command": "python anomaly_trigger/main.py --anomaly POOR_JOIN_PERFORMANCE,CPU_CONTENTION", + "script": "import os\nimport re\nimport time\n\nimport psycopg2\n\n\nREPEATCOUNT = 1\nTIMELOGPATH = str(int(time.time())) + \"_job_trigger_time_log.txt\"\nTIMELOG = open(TIMELOGPATH, 'w+')\n\n\nclass Database():\n\n def __init__(self):\n self.conn = None\n self.conn = psycopg2.connect(database='imdbload',\n user='xxxx',\n password='xxxx',\n host='xxxx',\n port=5432)\n\n def execute_sql(self, sql):\n fail = 1\n cur = self.conn.cursor()\n i = 0\n cnt = 3\n while fail == 1 and i < cnt:\n try:\n fail = 0\n cur.execute(sql)\n except BaseException as error:\n fail = 1\n print(error)\n res = []\n if fail == 0:\n res = cur.fetchall()\n i = i + 1\n if fail == 1:\n # print(\"SQL Execution Fatal!!\", sql)\n return 0, ''\n elif fail == 0:\n return 1, res\n\n\ndef all_sql_files():\n res_path = \"{}/join-order-benchmark-master/\".format(\n os.path.dirname(os.path.abspath(__file__)))\n # all_file_list = list(filter(file_filter, os.listdir(res_path)))\n # all_file_list = sorted(all_file_list, key=custom_sort)\n all_file_list = [\n '1a.sql', '1b.sql', '1c.sql', '1d.sql',\n '2a.sql', '2b.sql', '2c.sql', '2d.sql',\n '3a.sql', '3b.sql', '3c.sql',\n '4a.sql', '4b.sql', '4c.sql',\n '5a.sql', '5b.sql', '5c.sql',\n '6a.sql', '6b.sql', '6c.sql', '6d.sql', '6e.sql', '6f.sql',\n '7a.sql', '7b.sql', '7c.sql',\n '8a.sql', '8b.sql', '8c.sql', '8d.sql',\n '9a.sql', '9b.sql', '9c.sql', '9d.sql',\n '10a.sql', '10b.sql', '10c.sql',\n '11a.sql', '11b.sql', '11c.sql', '11d.sql',\n '12a.sql', '12b.sql', '12c.sql',\n '13a.sql', '13b.sql', '13c.sql', '13d.sql',\n '14a.sql', '14b.sql', '14c.sql',\n '15a.sql', '15b.sql', '15c.sql', '15d.sql',\n '16a.sql', '16b.sql', 
'16c.sql', '16d.sql',\n '17a.sql', '17b.sql', '17c.sql', '17d.sql', '17e.sql', '17f.sql',\n '18a.sql', '18b.sql', '18c.sql',\n '19a.sql', '19b.sql', '19c.sql', '19d.sql',\n '20a.sql', '20b.sql', '20c.sql',\n '21a.sql', '21b.sql', '21c.sql',\n '22a.sql', '22b.sql', '22c.sql', '22d.sql',\n '23a.sql', '23b.sql', '23c.sql',\n '24a.sql', '24b.sql',\n '25a.sql', '25b.sql', '25c.sql',\n '26a.sql', '26b.sql', '26c.sql',\n '27a.sql', '27b.sql', '27c.sql',\n '28a.sql', '28b.sql', '28c.sql',\n '29a.sql', '29b.sql', '29c.sql',\n '30a.sql', '30b.sql', '30c.sql',\n '31a.sql', '31b.sql', '31c.sql',\n '32a.sql', '32b.sql',\n '33a.sql', '33b.sql', '33c.sql']\n\n print(all_file_list)\n files_list = []\n for file in all_file_list:\n files_list.append(res_path + file)\n return files_list\n\n\ndef custom_sort(item):\n # \u63d0\u53d6\u6570\u5b57\u548c\u5b57\u6bcd\u90e8\u5206\n match = re.match(r'(\\d+)(\\D+)', item)\n # \u5c06\u6570\u5b57\u90e8\u5206\u8f6c\u6362\u4e3a\u6574\u6570\u4ee5\u8fdb\u884c\u6bd4\u8f83\n num_part = int(match.group(1))\n # \u8fd4\u56de\u5143\u7ec4\u4ee5\u6309\u6570\u5b57\u548c\u5b57\u6bcd\u6392\u5e8f\n return (num_part, match.group(2))\n\n\ndef file_filter(f):\n if f[-4:] == '.sql' and 'schema' not in f and 'fkindexes' not in f:\n return True\n else:\n return False\n\n\ndef get_sql_from_file(file_name):\n file = open(file_name)\n lines = file.readlines().copy()\n sql = ''\n for line in lines:\n sql += line\n sql = sql.replace('\n', ' ').replace(' ', ' ').replace(' ', ' ')\n file.close()\n return sql\n\n\ndef test_hint_from_file(sql_file):\n db = Database()\n sql = get_sql_from_file(sql_file)\n success, result_cont = db.execute_sql(sql)\n print(success, result_cont)\n\n\ndef test_all():\n sql_files = all_sql_files()\n\n for sql_file in list(sql_files)[:-10]:\n if sql_file:\n test_hint_from_file(sql_file)\n\n\ndef test_one():\n res_path = \"{}/join-order-benchmark-master/\".format(\n os.path.dirname(os.path.abspath(__file__)))\n test_hint_from_file(res_path + '1a.sql')\n\n\nif __name__ == '__main__':\n for i in range(0, REPEATCOUNT):\n TIMELOG.write(str(int(time.time()))+\";\")\n test_all()\n TIMELOG.write(str(int(time.time()))+\"\n\")\n TIMELOG.flush()\n\n TIMELOG.close()\n\n)\n", + "description": "In a multi-threaded system where data is accessed and manipulated simultaneously, there is a scenario where poor join performance and CPU contention can occur. This means that when performing database queries involving multiple tables and joins, the system experiences slow performance due to high CPU usage and contention among multiple threads competing for CPU resources.\n", + "desc": "In a business intelligence scenario, there is a database called 'CorporateAnalyticsDB' that stores and analyzes various business data for a large corporation. This database contains multiple complex tables, such as 'SalesData', 'CustomerProfiles', 'ProductCatalog', and 'MarketTrends'. These tables contain a large number of rows and columns, recording detailed business information. Analysts in the company frequently need to perform complex join queries across these tables to generate comprehensive business reports. For example, they might need to join the 'SalesData' table with the 'CustomerProfiles' table to analyze customer purchasing behaviors. Additionally, they might also need to link these data with the 'ProductCatalog' and 'MarketTrends' tables to gain deeper market insights. 
However, the performance of these join queries can be slow due to the large size of the tables and the involvement of multiple tables in the join process. If the join keys are not effectively indexed or if the queries are not properly optimized, executing these join operations can consume significant time and resources. During peak periods, when multiple complex join queries are executed simultaneously, there can be competition for CPU resources, further diminishing query efficiency. This CPU contention can occur when there are too many compute-intensive queries running on the database server or when the server's CPU resources are insufficient to handle these queries.\n" + }, + "239": { + "start_time": "1697331650", + "end_time": "1697331798", + "start_timestamp": "2023-10-15 09:00:50", + "end_timestamp": "2023-10-15 09:03:18", + "alerts": [], + "labels": [ + "FETCH_LARGE_DATA", + "CORRELATED SUBQUERY" + ], + "command": "python anomaly_trigger/main.py --anomaly FETCH_LARGE_DATA,CORRELATED_SUBQUERY", + "script": "import os\nimport re\nimport time\n\nimport psycopg2\n\n\nREPEATCOUNT = 1\nTIMELOGPATH = str(int(time.time())) + \"_tpch_trigger_time_log.txt\"\nTIMELOG = open(TIMELOGPATH, 'w+')\n\n\nclass Database():\n\n def __init__(self):\n self.conn = None\n self.conn = psycopg2.connect(database='tpch',\n user='xxxx',\n password='xxxx',\n host='xxxx',\n port=5432)\n\n def execute_sql(self, sql):\n fail = 1\n cur = self.conn.cursor()\n i = 0\n cnt = 3\n while fail == 1 and i < cnt:\n try:\n fail = 0\n cur.execute(sql)\n except BaseException as error:\n fail = 1\n print(error)\n res = []\n if fail == 0:\n res = cur.fetchall()\n i = i + 1\n if fail == 1:\n # print(\"SQL Execution Fatal!!\", sql)\n return 0, ''\n elif fail == 0:\n return 1, res\n\n\ndef all_sql_files():\n res_path = \"{}/tpch-queries/\".format(\n os.path.dirname(os.path.abspath(__file__)))\n # all_file_list = list(filter(file_filter, os.listdir(res_path)))\n # all_file_list = sorted(all_file_list, key=custom_sort)\n all_file_list = [\n '4.explain.sql']\n\n print(all_file_list)\n files_list = []\n for file in all_file_list:\n files_list.append(res_path + file)\n return files_list\n\n\ndef custom_sort(item):\n # \u63d0\u53d6\u6570\u5b57\u548c\u5b57\u6bcd\u90e8\u5206\n match = re.match(r'(\\d+)(\\D+)', item)\n # \u5c06\u6570\u5b57\u90e8\u5206\u8f6c\u6362\u4e3a\u6574\u6570\u4ee5\u8fdb\u884c\u6bd4\u8f83\n num_part = int(match.group(1))\n # \u8fd4\u56de\u5143\u7ec4\u4ee5\u6309\u6570\u5b57\u548c\u5b57\u6bcd\u6392\u5e8f\n return (num_part, match.group(2))\n\n\ndef file_filter(f):\n if f[-4:] == '.sql' and 'schema' not in f and 'fkindexes' not in f:\n return True\n else:\n return False\n\n\ndef get_sql_from_file(file_name):\n file = open(file_name)\n lines = file.readlines().copy()\n sql = ''\n for line in lines:\n sql += line\n sql = sql.replace('\n', ' ').replace(' ', ' ').replace(' ', ' ')\n file.close()\n return sql\n\n\ndef test_hint_from_file(sql_file):\n db = Database()\n sql = get_sql_from_file(sql_file)\n success, result_cont = db.execute_sql(sql)\n print(success, result_cont)\n\n\ndef test_all():\n sql_files = all_sql_files()\n\n for sql_file in list(sql_files):\n if sql_file:\n test_hint_from_file(sql_file)\n\n\ndef test_one():\n res_path = \"{}/tpch-queries/\".format(\n os.path.dirname(os.path.abspath(__file__)))\n test_hint_from_file(res_path + '1.explain.sql')\n\n\nif __name__ == '__main__':\n for i in range(0, REPEATCOUNT):\n TIMELOG.write(str(int(time.time()))+\";\")\n test_all()\n 
TIMELOG.write(str(int(time.time()))+\"\n\")\n TIMELOG.flush()\n\n TIMELOG.close()\n", + "description": "In an online marketplace, the system needs to retrieve a large amount of data about the inventory for each product. This involves executing related subqueries to gather the necessary information. However, if these subqueries are not optimized, the performance of the system may be negatively affected, resulting in slower retrieval of inventory data.\n", + "desc": "In an IoT scenario, imagine a database called 'SmartHomeDB', which is used to store and manage data from various smart home devices. This database contains multiple tables, one of which is a key table called 'DeviceReadings', which records readings from different types of sensors installed in smart home devices. This table consists of a large number of rows, each representing a reading from a specific sensor, and includes columns such as sensor ID, sensor type (such as temperature, humidity, motion, light, etc.), reading value, timestamp, and device ID. In this IoT environment, it is common for multiple sensors to generate data simultaneously, resulting in a large volume of data being written to the 'DeviceReadings' table. Suppose at a specific moment, a large amount of data is being generated simultaneously by multiple sensors, causing a significant increase in the database's write workload and creating I/O (input/output) contention. This contention occurs when multiple write operations are competing for the database's I/O resources, such as disk reads and writes. As a result, the database may experience slower write speeds and reduced overall performance.\n" + }, + "240": { + "start_time": "1697331859", + "end_time": "1697331930", + "start_timestamp": "2023-10-15 09:04:19", + "end_timestamp": "2023-10-15 09:05:30", + "alerts": [], + "labels": [ + "highly concurrent commits or highly concurrent inserts" + ], + "command": "python anomaly_trigger/main.py --anomaly INSERT_LARGE_DATA", + "script": "import psycopg2\nimport sys\nsys.path.append('/root/DB-GPT/')\nimport time\nimport datetime\nimport random\nimport yaml\nfrom multiprocessing.pool import *\n\nclass DBArgs(object):\n\n def __init__(self, dbtype, config, dbname=None):\n self.dbtype = dbtype\n if self.dbtype == 'mysql':\n self.host = config['host']\n self.port = config['port']\n self.user = config['user']\n self.password = config['password']\n self.dbname = dbname if dbname else config['dbname']\n self.driver = 'com.mysql.jdbc.Driver'\n self.jdbc = 'jdbc:mysql://'\n else:\n self.host = config['host']\n self.port = config['port']\n self.user = config['user']\n self.password = config['password']\n self.dbname = dbname if dbname else config['dbname']\n self.driver = 'org.postgresql.Driver'\n self.jdbc = 'jdbc:postgresql://'\n\nclass Database():\n def __init__(self, args, timeout=-1):\n self.args = args\n self.conn = self.resetConn(timeout)\n\n\n # self.schema = self.compute_table_schema()\n\n def resetConn(self, timeout=-1):\n conn = psycopg2.connect(database=self.args.dbname,\n user=self.args.user,\n password=self.args.password,\n host=self.args.host,\n port=self.args.port)\n return conn\n\n def execute_sqls(self,sql):\n self.conn =self.resetConn(timeout=-1)\n cur = self.conn.cursor()\n cur.execute(sql)\n self.conn.commit()\n cur.close()\n self.conn.close()\n\n def execute_sql_duration(self, duration, sql, max_id=0, commit_interval=500):\n self.conn = self.resetConn(timeout=-1)\n cursor = self.conn.cursor()\n start = time.time()\n cnt = 0\n if duration > 0:\n while 
(time.time() - start) < duration:\n if max_id > 0:\n id = random.randint(1, max_id - 1)\n cursor.execute(sql + str(id) + ';')\n else:\n cursor.execute(sql)\n cnt += 1\n if cnt % commit_interval == 0:\n self.conn.commit()\n else:\n print(\"error, the duration should be larger than 0\")\n self.conn.commit()\n cursor.close()\n self.conn.close()\n return cnt\n\n def concurrent_execute_sql(self, threads, duration, sql, max_id=0, commit_interval=500):\n pool = ThreadPool(threads)\n results = [pool.apply_async(self.execute_sql_duration, (duration, sql, max_id, commit_interval)) for _ in range(threads)]\n pool.close()\n pool.join()\n return results\n \ndef init():\n #add the config\n config_path = \"/root/DB-GPT/config/tool_config.yaml\"\n with open(config_path, 'r') as config_file:\n config = yaml.safe_load(config_file) \n db_args =DBArgs('pgsql', config)\n return db_args\n\n#create a table\ndef create_table(table_name,colsize, ncolumns):\n db=Database(init())\n column_definitions = ', '.join(f'name{i} varchar({colsize})' for i in range(ncolumns))\n creat_sql = f'CREATE TABLE {table_name} (id int, {column_definitions}, time timestamp);'\n db.execute_sqls(creat_sql)\n\n#delete the table\ndef delete_table(table_name):\n db=Database(init())\n delete_sql=f'DROP TABLE if exists {table_name}'\n db.execute_sqls(delete_sql)\n\n#print the current time\ndef print_time():\n current_time = datetime.datetime.now()\n formatted_time = current_time.strftime(\"%Y-%m-%d %H:%M:%S\")\n print(formatted_time)\n\n\ndef insert_large_data(threads,duration,ncolumns,nrows,colsize,table_name='table1'):\n\n print_time()\n #Delete undeleted tables\n delete_table(table_name)\n #create a new table\n create_table(table_name,colsize, ncolumns)\n db=Database(init())\n #insert the data\n #insert_definitions = ', '.join(f'repeat(round(random()*999)::text,{(colsize//3)})' for i in range(ncolumns))\n insert_definitions = ', '.join(f'(SELECT substr(md5(random()::text), 1, {colsize}))' for i in range(ncolumns))\n insert_data=f'insert into {table_name} select generate_series(1,{nrows}),{insert_definitions}, now();'\n db.concurrent_execute_sql(threads,duration,insert_data,commit_interval=1)\n\n #delete the table\n delete_table(table_name)\n \n #print the end time\n print_time()\nif __name__ == \"__main__\":\n # Number of threads to use for concurrent inserts\n num_threads = 84\n \n # Duration for which to run the inserts (in seconds)\n insert_duration = None\n \n # Number of columns in the table\n num_columns = 12\n \n # Number of rows to insert\n num_rows = 80\n \n # Size of each column (in characters)\n column_size = 41\n \n # Table name\n table_name = 'table1'\n \n # Call the insert_large_data function\n insert_large_data(num_threads, insert_duration, num_columns, num_rows, column_size, table_name)\n", + "description": "In an Internet of Things (IoT) application, 84 sensors are generating a large amount of data that needs to be inserted into the database simultaneously. Each data entry consists of 12 columns, where each column can store up to 41 characters. The dataset contains 80 rows of sensor data. This simulates a scenario where the database experiences an exception due to the high volume of data being inserted at the same time.\n", + "desc": "In an Internet of Things (IoT) scenario, there is a database called 'IoTDataDB' specifically designed for storing and analyzing data from various IoT devices. 
This database is used to collect data from 80 different sensors, including temperature sensors, humidity sensors, motion sensors, light sensors, and more. Each sensor generates data at a high frequency and transmits it to the 'SensorData' table in the database. This table has 12 columns, including sensor ID, sensor type, reading value, timestamp, location, and status. During a specific testing period, 84 simulated IoT devices are continuously generating and transmitting data to the database at a high frequency. Each device generates data for each of the 80 sensors every second. Due to the large number of devices and the high frequency of data generation, the database may face performance issues, especially when it comes to handling the large volume of incoming data. If the database is not properly optimized to handle such a massive influx of data, the insertion of this large amount of data may trigger anomalies in the system. These anomalies could manifest as increased write latency, database locking, or even system crashes. Therefore, it is crucial to ensure that the database is properly configured and optimized to handle such large-scale data insertions in order to maintain the smooth functioning of the IoT system.\n" + }, + "241": { + "start_time": "1697331990", + "end_time": "1697332062", + "start_timestamp": "2023-10-15 09:06:30", + "end_timestamp": "2023-10-15 09:07:42", + "alerts": [], + "labels": [ + "highly concurrent commits or highly concurrent inserts" + ], + "command": "python anomaly_trigger/main.py --anomaly INSERT_LARGE_DATA", + "script": "import psycopg2\nimport sys\nsys.path.append('/root/DB-GPT/')\nimport time\nimport datetime\nimport random\nimport yaml\nfrom multiprocessing.pool import *\n\nclass DBArgs(object):\n\n def __init__(self, dbtype, config, dbname=None):\n self.dbtype = dbtype\n if self.dbtype == 'mysql':\n self.host = config['host']\n self.port = config['port']\n self.user = config['user']\n self.password = config['password']\n self.dbname = dbname if dbname else config['dbname']\n self.driver = 'com.mysql.jdbc.Driver'\n self.jdbc = 'jdbc:mysql://'\n else:\n self.host = config['host']\n self.port = config['port']\n self.user = config['user']\n self.password = config['password']\n self.dbname = dbname if dbname else config['dbname']\n self.driver = 'org.postgresql.Driver'\n self.jdbc = 'jdbc:postgresql://'\n\nclass Database():\n def __init__(self, args, timeout=-1):\n self.args = args\n self.conn = self.resetConn(timeout)\n\n\n # self.schema = self.compute_table_schema()\n\n def resetConn(self, timeout=-1):\n conn = psycopg2.connect(database=self.args.dbname,\n user=self.args.user,\n password=self.args.password,\n host=self.args.host,\n port=self.args.port)\n return conn\n\n def execute_sqls(self,sql):\n self.conn =self.resetConn(timeout=-1)\n cur = self.conn.cursor()\n cur.execute(sql)\n self.conn.commit()\n cur.close()\n self.conn.close()\n\n def execute_sql_duration(self, duration, sql, max_id=0, commit_interval=500):\n self.conn = self.resetConn(timeout=-1)\n cursor = self.conn.cursor()\n start = time.time()\n cnt = 0\n if duration > 0:\n while (time.time() - start) < duration:\n if max_id > 0:\n id = random.randint(1, max_id - 1)\n cursor.execute(sql + str(id) + ';')\n else:\n cursor.execute(sql)\n cnt += 1\n if cnt % commit_interval == 0:\n self.conn.commit()\n else:\n print(\"error, the duration should be larger than 0\")\n self.conn.commit()\n cursor.close()\n self.conn.close()\n return cnt\n\n def concurrent_execute_sql(self, threads, duration, sql, 
max_id=0, commit_interval=500):\n pool = ThreadPool(threads)\n results = [pool.apply_async(self.execute_sql_duration, (duration, sql, max_id, commit_interval)) for _ in range(threads)]\n pool.close()\n pool.join()\n return results\n \ndef init():\n #add the config\n config_path = \"/root/DB-GPT/config/tool_config.yaml\"\n with open(config_path, 'r') as config_file:\n config = yaml.safe_load(config_file) \n db_args =DBArgs('pgsql', config)\n return db_args\n\n#create a table\ndef create_table(table_name,colsize, ncolumns):\n db=Database(init())\n column_definitions = ', '.join(f'name{i} varchar({colsize})' for i in range(ncolumns))\n creat_sql = f'CREATE TABLE {table_name} (id int, {column_definitions}, time timestamp);'\n db.execute_sqls(creat_sql)\n\n#delete the table\ndef delete_table(table_name):\n db=Database(init())\n delete_sql=f'DROP TABLE if exists {table_name}'\n db.execute_sqls(delete_sql)\n\n#print the current time\ndef print_time():\n current_time = datetime.datetime.now()\n formatted_time = current_time.strftime(\"%Y-%m-%d %H:%M:%S\")\n print(formatted_time)\n\n\ndef insert_large_data(threads,duration,ncolumns,nrows,colsize,table_name='table1'):\n\n print_time()\n #Delete undeleted tables\n delete_table(table_name)\n #create a new table\n create_table(table_name,colsize, ncolumns)\n db=Database(init())\n #insert the data\n #insert_definitions = ', '.join(f'repeat(round(random()*999)::text,{(colsize//3)})' for i in range(ncolumns))\n insert_definitions = ', '.join(f'(SELECT substr(md5(random()::text), 1, {colsize}))' for i in range(ncolumns))\n insert_data=f'insert into {table_name} select generate_series(1,{nrows}),{insert_definitions}, now();'\n db.concurrent_execute_sql(threads,duration,insert_data,commit_interval=1)\n\n #delete the table\n delete_table(table_name)\n \n #print the end time\n print_time()\nif __name__ == \"__main__\":\n # Number of threads to use for concurrent inserts\n num_threads = 84\n \n # Duration for which to run the inserts (in seconds)\n insert_duration = None\n \n # Number of columns in the table\n num_columns = 22\n \n # Number of rows to insert\n num_rows = 70\n \n # Size of each column (in characters)\n column_size = 66\n \n # Table name\n table_name = 'table1'\n \n # Call the insert_large_data function\n insert_large_data(num_threads, insert_duration, num_columns, num_rows, column_size, table_name)\n", + "description": "In a smart home system, 84 IoT devices simultaneously generate a large amount of data that needs to be inserted into the database. Each IoT device has 22 sensors, and each sensor generates data with a size of 66 characters. The database table has a total of 70 rows. This process simulates the database exception caused by the simultaneous insertion of large data from multiple devices.\n", + "desc": "In an e-commerce scenario, there is a database named 'ECommerceDB' used for storing and managing product information. In this database, there is a table called 'ProductData' that contains detailed data about various products. Each row in the table represents a separate product, and there are a total of 70 rows. The table has 22 columns, including attributes such as product ID, name, price, stock quantity, brand, category, size, color, weight, user ratings, number of reviews, production date, expiration date, supplier information, country, shipping method, discount information, image link, description, and product status (e.g. available, out of stock). Each column has a size of 66 characters. 
Now, suppose there is a situation where 84 users are simultaneously inserting large amounts of new product data into the 'ProductData' table. This could happen, for example, when a company is launching a new product line and needs to bulk insert the product information into the database. If the database is not properly optimized to handle these concurrent insert operations, it can lead to performance issues and anomalies. The efficiency of the insertion process may be compromised, causing delays or failures in the database's ability to accept and process the new product data. This can also impact other database operations and potentially affect the overall operation of the e-commerce platform.\n" + }, + "242": { + "start_time": "1697332122", + "end_time": "1697332182", + "start_timestamp": "2023-10-15 09:08:42", + "end_timestamp": "2023-10-15 09:09:42", + "alerts": [], + "labels": [ + "highly concurrent updates" + ], + "command": "python anomaly_trigger/main.py --anomaly LOCK_CONTENTION", + "script": "import psycopg2\nimport sys\nsys.path.append('/root/DB-GPT/')\nimport time\nimport datetime\nimport random\nimport yaml\nfrom multiprocessing.pool import *\n\n\nclass DBArgs(object):\n\n def __init__(self, dbtype, config, dbname=None):\n self.dbtype = dbtype\n if self.dbtype == 'mysql':\n self.host = config['host']\n self.port = config['port']\n self.user = config['user']\n self.password = config['password']\n self.dbname = dbname if dbname else config['dbname']\n self.driver = 'com.mysql.jdbc.Driver'\n self.jdbc = 'jdbc:mysql://'\n else:\n self.host = config['host']\n self.port = config['port']\n self.user = config['user']\n self.password = config['password']\n self.dbname = dbname if dbname else config['dbname']\n self.driver = 'org.postgresql.Driver'\n self.jdbc = 'jdbc:postgresql://'\n\nclass Database():\n def __init__(self, args, timeout=-1):\n self.args = args\n self.conn = self.resetConn(timeout)\n\n\n # self.schema = self.compute_table_schema()\n\n def resetConn(self, timeout=-1):\n conn = psycopg2.connect(database=self.args.dbname,\n user=self.args.user,\n password=self.args.password,\n host=self.args.host,\n port=self.args.port)\n return conn\n \n def execute_sqls(self,sql):\n self.conn =self.resetConn(timeout=-1)\n cur = self.conn.cursor()\n cur.execute(sql)\n self.conn.commit()\n cur.close()\n self.conn.close()\n\n def execute_sql_duration(self, duration, sql, max_id=0, commit_interval=500):\n self.conn = self.resetConn(timeout=-1)\n cursor = self.conn.cursor()\n start = time.time()\n cnt = 0\n if duration > 0:\n while (time.time() - start) < duration:\n if max_id > 0:\n id = random.randint(1, max_id - 1)\n cursor.execute(sql + str(id) + ';')\n else:\n cursor.execute(sql)\n cnt += 1\n if cnt % commit_interval == 0:\n self.conn.commit()\n else:\n print(\"error, the duration should be larger than 0\")\n self.conn.commit()\n cursor.close()\n self.conn.close()\n return cnt\n\n def concurrent_execute_sql(self, threads, duration, sql, max_id=0, commit_interval=500):\n pool = ThreadPool(threads)\n results = [pool.apply_async(self.execute_sql_duration, (duration, sql, max_id, commit_interval)) for _ in range(threads)]\n pool.close()\n pool.join()\n return results\n\ndef init():\n #add the config\n config_path = \"/root/DB-GPT/config/tool_config.yaml\"\n with open(config_path, 'r') as config_file:\n config = yaml.safe_load(config_file) \n db_args =DBArgs('pgsql', config)\n return db_args\n\n\n#create a table\ndef create_table(table_name,colsize, ncolumns):\n db=Database(init())\n 
column_definitions = ', '.join(f'name{i} varchar({colsize})' for i in range(ncolumns))\n creat_sql = f'CREATE TABLE {table_name} (id int, {column_definitions}, time timestamp);'\n db.execute_sqls(creat_sql)\n\n#delete the table\ndef delete_table(table_name):\n db=Database(init())\n delete_sql=f'DROP TABLE if exists {table_name}'\n db.execute_sqls(delete_sql)\n\n#print the current time\ndef print_time():\n current_time = datetime.datetime.now()\n formatted_time = current_time.strftime(\"%Y-%m-%d %H:%M:%S\")\n print(formatted_time)\n\n\ndef lock_contention(threads,duration,ncolumns,nrows,colsize,table_name='table1'):\n #create a new table\n print_time()\n delete_table(table_name)\n create_table(table_name,colsize, ncolumns)\n db=Database(init())\n # insert some data to be updated \n insert_definitions = ', '.join(f'(SELECT substr(md5(random()::text), 1, {colsize}))' for i in range(ncolumns))\n insert_data=f'insert into {table_name} select generate_series(1,{nrows}),{insert_definitions}, now();' \n db.execute_sqls(insert_data) \n pool = Pool(threads)\n for _ in range(threads):\n pool.apply_async(\n lock, (table_name, ncolumns, colsize, duration, nrows))\n pool.close()\n pool.join()\n #delete the table\n delete_table(table_name)\n print_time()\n\ndef lock(table_name, ncolumns, colsize, duration, nrows):\n args=init()\n start = time.time()\n #lock_contention\n while time.time()-start < duration:\n conn = psycopg2.connect(database=args.dbname, user=args.user, password=args.password,\n host=args.host, port=args.port)\n cur = conn.cursor()\n while time.time()-start < duration:\n col_name = random.randint(0, ncolumns-1)\n row_name = random.randint(1, nrows-1)\n lock_contention = f'update {table_name} set name{col_name}=(SELECT substr(md5(random()::text), 1, {colsize})) where id ={row_name}'\n #db.concurrent_execute_sql(threads,duration,lock_contention,nrows)\n cur.execute(lock_contention)\n conn.commit()\n conn.commit()\n conn.close()\n\nif __name__ == \"__main__\":\n # Number of threads to use for concurrent inserts\n num_threads = 83\n \n # Duration for which to run the inserts (in seconds)\n insert_duration = None\n \n # Number of columns in the table\n num_columns = 96\n \n # Number of rows to insert\n num_rows = 274\n \n # Size of each column (in characters)\n column_size = 85\n \n # Table name\n table_name = 'table1'\n \n # Call the insert_large_data function\n lock_contention(num_threads, insert_duration, num_columns, num_rows, column_size, table_name)\n", + "description": "In the database of an online store, a scenario is simulated where 83 users simultaneously perform frequent update operations in a database table containing 96 columns and 274 rows of product records. Each column has a size of 85 characters, and multiple users compete with each other to lock the database table during the update operation.\n", + "desc": "In an Internet of Things (IoT) scenario, there is a database used for collecting and analyzing sensor data in a smart home environment. This database is called 'SensorDataDB' and it stores information from various types of sensors. One of the key tables in this database is 'SensorReadings', which contains detailed information about sensor readings. There are 274 rows of data in this table, each representing a specific sensor reading. The table has 96 columns, with each column capable of storing up to 85 characters. 
The columns include sensor ID, reading type (such as temperature, humidity, light intensity), reading value, timestamp, sensor location, and status information. In this scenario, 83 sensors simultaneously transmit data to the database at a high frequency. However, due to the lack of proper database locking mechanisms or inefficient locking strategies, contention issues may occur when multiple threads try to access and update the 'SensorReadings' table simultaneously. This contention can lead to conflicts and delays in processing sensor data, impacting the overall performance of the system. It is crucial to implement appropriate locking mechanisms and strategies to handle concurrent access to the database and avoid such lock contention issues.\n" + }, + "243": { + "start_time": "1697332242", + "end_time": "1697332305", + "start_timestamp": "2023-10-15 09:10:42", + "end_timestamp": "2023-10-15 09:11:45", + "alerts": [], + "labels": [ + "highly deletes" + ], + "command": "python anomaly_trigger/main.py --anomaly VACUUM", + "script": "import psycopg2\nimport sys\nsys.path.append('/root/DB-GPT/')\nimport time\nimport datetime\nimport random\nimport yaml\nfrom multiprocessing.pool import *\n\n\nclass DBArgs(object):\n\n def __init__(self, dbtype, config, dbname=None):\n self.dbtype = dbtype\n if self.dbtype == 'mysql':\n self.host = config['host']\n self.port = config['port']\n self.user = config['user']\n self.password = config['password']\n self.dbname = dbname if dbname else config['dbname']\n self.driver = 'com.mysql.jdbc.Driver'\n self.jdbc = 'jdbc:mysql://'\n else:\n self.host = config['host']\n self.port = config['port']\n self.user = config['user']\n self.password = config['password']\n self.dbname = dbname if dbname else config['dbname']\n self.driver = 'org.postgresql.Driver'\n self.jdbc = 'jdbc:postgresql://'\n\nclass Database():\n def __init__(self, args, timeout=-1):\n self.args = args\n self.conn = self.resetConn(timeout)\n\n\n # self.schema = self.compute_table_schema()\n\n def resetConn(self, timeout=-1):\n conn = psycopg2.connect(database=self.args.dbname,\n user=self.args.user,\n password=self.args.password,\n host=self.args.host,\n port=self.args.port)\n return conn\n \n def execute_sqls(self,sql):\n self.conn =self.resetConn(timeout=-1)\n cur = self.conn.cursor()\n cur.execute(sql)\n self.conn.commit()\n cur.close()\n self.conn.close()\n\n def execute_sql_duration(self, duration, sql, max_id=0, commit_interval=500):\n self.conn = self.resetConn(timeout=-1)\n cursor = self.conn.cursor()\n start = time.time()\n cnt = 0\n if duration > 0:\n while (time.time() - start) < duration:\n if max_id > 0:\n id = random.randint(1, max_id - 1)\n cursor.execute(sql + str(id) + ';')\n else:\n cursor.execute(sql)\n cnt += 1\n if cnt % commit_interval == 0:\n self.conn.commit()\n else:\n print(\"error, the duration should be larger than 0\")\n self.conn.commit()\n cursor.close()\n self.conn.close()\n return cnt\n\n def concurrent_execute_sql(self, threads, duration, sql, max_id=0, commit_interval=500):\n pool = ThreadPool(threads)\n results = [pool.apply_async(self.execute_sql_duration, (duration, sql, max_id, commit_interval)) for _ in range(threads)]\n pool.close()\n pool.join()\n return results\n \ndef init():\n #add the config\n config_path = \"/root/DB-GPT/config/tool_config.yaml\"\n with open(config_path, 'r') as config_file:\n config = yaml.safe_load(config_file) \n db_args =DBArgs('pgsql', config)\n return db_args\n\n\n#create a table\ndef create_table(table_name,colsize, ncolumns):\n 
db=Database(init())\n column_definitions = ', '.join(f'name{i} varchar({colsize})' for i in range(ncolumns))\n creat_sql = f'CREATE TABLE {table_name} (id int, {column_definitions}, time timestamp);'\n db.execute_sqls(creat_sql)\n\n#delete the table\ndef delete_table(table_name):\n db=Database(init())\n delete_sql=f'DROP TABLE if exists {table_name}'\n db.execute_sqls(delete_sql)\n\n#print the current time\ndef print_time():\n current_time = datetime.datetime.now()\n formatted_time = current_time.strftime(\"%Y-%m-%d %H:%M:%S\")\n print(formatted_time)\n\n\ndef vacuum(threads,duration,ncolumns,nrows,colsize,table_name='table1'):\n db=Database(init())\n #create a new table\n print_time()\n delete_table(table_name)\n create_table(table_name,colsize, ncolumns)\n\n # insert some data to be deleted\n insert_definitions = ', '.join(f'(SELECT substr(md5(random()::text), 1, {colsize}))' for i in range(ncolumns))\n insert_data=f'insert into {table_name} select generate_series(1,{nrows}),{insert_definitions}, now();' \n db.execute_sqls(insert_data) \n\n # delete 80% of the rows\n delete_nrows=int(nrows*0.8)\n vacuum=f'delete from {table_name} where id < {delete_nrows};'\n db.execute_sqls(vacuum)\n\n # do the select , then the vacuum occurs\n select='select * from '+table_name+' where id='\n db.concurrent_execute_sql(threads,duration,select,nrows)\n\n #delete the table\n delete_table(table_name)\n print_time()\n\n\nif __name__ == \"__main__\":\n # Number of threads to use for concurrent inserts\n num_threads = 131\n \n # Duration for which to run the inserts (in seconds)\n insert_duration = None\n \n # Number of columns in the table\n num_columns = 12\n \n # Number of rows to insert\n num_rows = 3107663\n \n # Size of each column (in characters)\n column_size = 66\n \n # Table name\n table_name = 'table1'\n \n # Call the insert_large_data function\n vacuum(num_threads, insert_duration, num_columns, num_rows, column_size, table_name)\n", + "description": "In an e-commerce platform's database, users frequently search for products based on various criteria such as product name, category, and price range. However, after performing a large-scale data cleaning operation on a database table with 12 columns and 3,107,663 rows, each column containing 66 characters of product records, an exception occurs when 131 users simultaneously perform a search. This simulates the database's response to a high volume of search queries after a data cleaning operation.\n", + "desc": "In a life scenario related to file sharing, suppose there is a database called 'TeamFileShareDB' that is used by teams or organizations for sharing files. This database not only stores the files themselves but also records metadata about the files, such as uploader information, file size, creation and modification dates, version history, access permissions, and download counts. In this scenario, the database encounters a VACUUM anomaly. VACUUM is a command used to optimize the database by reclaiming unused space and improving query performance. The script is executed with the anomaly flag set to VACUUM, indicating that the administrator wants to perform a VACUUM operation on the database. The script also specifies various parameters such as the number of threads (131), the number of columns in the table (12), the size of each column (66 characters), and the number of rows in the table (3,107,663). 
This suggests that the database administrator wants to optimize the storage and performance of the 'TeamFileShareDB' database by reclaiming unused space and improving query performance.\n" + }, + "244": { + "start_time": "1697332365", + "end_time": "1697332478", + "start_timestamp": "2023-10-15 09:12:45", + "end_timestamp": "2023-10-15 09:14:38", + "alerts": [], + "labels": [ + "too many indexes" + ], + "command": "python anomaly_trigger/main.py --anomaly REDUNDANT_INDEX", + "script": "import psycopg2\nimport sys\nsys.path.append('/root/DB-GPT/')\nimport time\nimport datetime\nimport random\nimport yaml\nfrom multiprocessing.pool import *\n\n\nclass DBArgs(object):\n\n def __init__(self, dbtype, config, dbname=None):\n self.dbtype = dbtype\n if self.dbtype == 'mysql':\n self.host = config['host']\n self.port = config['port']\n self.user = config['user']\n self.password = config['password']\n self.dbname = dbname if dbname else config['dbname']\n self.driver = 'com.mysql.jdbc.Driver'\n self.jdbc = 'jdbc:mysql://'\n else:\n self.host = config['host']\n self.port = config['port']\n self.user = config['user']\n self.password = config['password']\n self.dbname = dbname if dbname else config['dbname']\n self.driver = 'org.postgresql.Driver'\n self.jdbc = 'jdbc:postgresql://'\n\nclass Database():\n def __init__(self, args, timeout=-1):\n self.args = args\n self.conn = self.resetConn(timeout)\n\n\n # self.schema = self.compute_table_schema()\n\n def resetConn(self, timeout=-1):\n conn = psycopg2.connect(database=self.args.dbname,\n user=self.args.user,\n password=self.args.password,\n host=self.args.host,\n port=self.args.port)\n return conn\n \n def execute_sqls(self,sql):\n self.conn =self.resetConn(timeout=-1)\n cur = self.conn.cursor()\n cur.execute(sql)\n self.conn.commit()\n cur.close()\n self.conn.close()\n\n def execute_sql_duration(self, duration, sql, max_id=0, commit_interval=500):\n self.conn = self.resetConn(timeout=-1)\n cursor = self.conn.cursor()\n start = time.time()\n cnt = 0\n if duration > 0:\n while (time.time() - start) < duration:\n if max_id > 0:\n id = random.randint(1, max_id - 1)\n cursor.execute(sql + str(id) + ';')\n else:\n cursor.execute(sql)\n cnt += 1\n if cnt % commit_interval == 0:\n self.conn.commit()\n else:\n print(\"error, the duration should be larger than 0\")\n self.conn.commit()\n cursor.close()\n self.conn.close()\n return cnt\n\n def concurrent_execute_sql(self, threads, duration, sql, max_id=0, commit_interval=500):\n pool = ThreadPool(threads)\n results = [pool.apply_async(self.execute_sql_duration, (duration, sql, max_id, commit_interval)) for _ in range(threads)]\n pool.close()\n pool.join()\n return results\n \n def build_index(self, table_name, idx_num):\n self.conn = self.resetConn(timeout=-1)\n cursor = self.conn.cursor()\n \n for i in range(0, idx_num):\n the_sql = 'CREATE INDEX index_' + table_name + '_' + str(i) + ' ON ' + table_name + '(name' + str(i) + ');'\n print(the_sql)\n cursor.execute(the_sql)\n\n \n self.conn.commit()\n self.conn.close()\n return\n\n\n \n def drop_index(self,table_name):\n self.conn = self.resetConn(timeout=-1)\n cursor = self.conn.cursor()\n cursor.execute(\"select indexname from pg_indexes where tablename='\"+table_name+\"';\")\n idxs = cursor.fetchall()\n for idx in idxs:\n the_sql = 'DROP INDEX ' + idx[0] + ';'\n cursor.execute(the_sql)\n print(the_sql)\n self.conn.commit()\n self.conn.close()\n return\n\n\ndef init():\n #add the config\n config_path = \"/root/DB-GPT/config/tool_config.yaml\"\n with 
open(config_path, 'r') as config_file:\n config = yaml.safe_load(config_file) \n db_args =DBArgs('pgsql', config)\n return db_args\n\n\n#create a table\ndef create_table(table_name,colsize, ncolumns):\n db=Database(init())\n column_definitions = ', '.join(f'name{i} varchar({colsize})' for i in range(ncolumns))\n creat_sql = f'CREATE TABLE {table_name} (id int, {column_definitions}, time timestamp);'\n db.execute_sqls(creat_sql)\n\n#delete the table\ndef delete_table(table_name):\n db=Database(init())\n delete_sql=f'DROP TABLE if exists {table_name}'\n db.execute_sqls(delete_sql)\n\n#print the current time\ndef print_time():\n current_time = datetime.datetime.now()\n formatted_time = current_time.strftime(\"%Y-%m-%d %H:%M:%S\")\n print(formatted_time)\n\ndef redundent_index(threads,duration,ncolumns,nrows,colsize,nindex,table_name='table1'):\n #create a new table\n print_time()\n delete_table(table_name)\n create_table(table_name,colsize, ncolumns)\n db=Database(init())\n # insert some data to be updated \n insert_definitions = ', '.join(f'(SELECT substr(md5(random()::text), 1, {colsize}))' for i in range(ncolumns))\n insert_data=f'insert into {table_name} select generate_series(1,{nrows}),{insert_definitions}, now();' \n db.execute_sqls(insert_data) \n\n #initialization of the indexes\n nindex=int((nindex*ncolumns)/10)\n db.build_index(table_name,nindex)\n id_index='CREATE INDEX index_'+table_name+'_id ON '+table_name+'(id);'\n db.execute_sqls(id_index)\n\n #lock_contention\n pool = Pool(threads)\n for _ in range(threads):\n pool.apply_async(\n lock, (table_name, ncolumns, colsize, duration, nrows))\n pool.close()\n pool.join()\n\n #drop the index\n db.drop_index(table_name)\n\n #delete the table\n delete_table(table_name)\n print_time()\n\n\ndef lock(table_name, ncolumns, colsize, duration, nrows):\n args=init()\n start = time.time()\n #lock_contention\n while time.time()-start < duration:\n conn = psycopg2.connect(database=args.dbname, user=args.user, password=args.password,\n host=args.host, port=args.port)\n cur = conn.cursor()\n while time.time()-start < duration:\n col_name = random.randint(0, ncolumns-1)\n row_name = random.randint(1, nrows-1)\n lock_contention = f'update {table_name} set name{col_name}=(SELECT substr(md5(random()::text), 1, {colsize})) where id ={row_name}'\n #db.concurrent_execute_sql(threads,duration,lock_contention,nrows)\n cur.execute(lock_contention)\n conn.commit()\n conn.commit()\n conn.close()\n\nif __name__ == \"__main__\":\n # Number of threads to use for concurrent inserts\n num_threads = 8\n \n # Duration for which to run the inserts (in seconds)\n insert_duration = None\n \n # Number of columns in the table\n num_columns = 76\n \n # Number of rows to insert\n num_rows = 600440\n \n # Size of each column (in characters)\n column_size = 64\n \n # Table name\n table_name = 'table1'\n \n nindex=6\n \n # Call the insert_large_data function\n redundent_index(num_threads, insert_duration, num_columns, num_rows, column_size, nindex,table_name)\n", + "description": "In an e-commerce database with 76 columns and 600,440 rows, each with a column size of 64 characters, a large number of indexes are created for items such as product name, category, and price range at the beginning of the query. These indexes are deleted after the query operation. 
Simulate the additional storage footprint and performance overhead caused by this process with 8 users.\n", + "desc": "In the database of an IoT system, there is a database named 'IoTDataDB' that is responsible for storing and processing sensor data. This database is used to gather information from numerous sensors deployed in different locations. Each sensor collects data at regular intervals, and this data is then stored in the 'SensorData' table, which contains detailed information about each sensor reading. The table consists of 600,440 rows of data, with each row representing a sensor reading and a total of 76 columns. These columns may include sensor ID, location, timestamp, temperature, humidity, pressure, air quality, and other relevant sensor measurements. In this scenario, multiple sensors continuously transmit data to the database in real-time. To improve query performance and provide quick insights, the database administrator might create redundant indexes on various columns, such as sensor ID, location, or timestamp. This is done to accelerate query execution and enable faster retrieval of data based on specific criteria. However, the frequent creation of redundant indexes can lead to additional storage usage and increased overhead in the database. Moreover, it can cause fragmentation and impact the performance of queries and write operations. In an IoT environment, such redundancy in indexes could result in delayed processing of real-time data, slower query execution, or even system failures if the database cannot handle the increased load efficiently.\n" + }, + "245": { + "start_time": "1697332538", + "end_time": "1697332629", + "start_timestamp": "2023-10-15 09:15:38", + "end_timestamp": "2023-10-15 09:17:09", + "alerts": [], + "labels": [ + "INSERT_LARGE_DATA", + "IO_CONTENTION" + ], + "command": "python anomaly_trigger/main.py --anomaly INSERT_LARGE_DATA,IO_CONTENTION", + "script": "import os\nimport datetime\n\n#print the current time\ndef print_time():\n current_time = datetime.datetime.now()\n formatted_time = current_time.strftime(\"%Y-%m-%d %H:%M:%S\")\n print(formatted_time)\n\nif __name__ == \"__main__\":\n print_time()\n command = (\n \"su - root -c 'cd /sysbench-tpcc-master; \"\n \"./tpcc.lua --db-driver=pgsql --tables=2 --scale=3 --threads=50 --events=0 \"\n \"--pgsql-host=xxxx --pgsql-user=xxxx --pgsql-password=xxxx \"\n \"--pgsql-port=5432 --pgsql-db=tpcc --time=90 --rand-type=uniform --report-interval=10 run'\"\n )\n\n os.system(command)\n print_time()\n", + "description": "In a system where multiple users are sharing files, there is a high volume of file uploads, downloads, and edits happening simultaneously. This creates a competition for input/output (I/O) resources, leading to slower file transfer speeds.\n", + "desc": "In an Internet of Things (IoT) scenario, there is a database called 'SensorDataDB' used for collecting and analyzing sensor data. This database is designed to handle a large volume of data coming from various sensors. One of the tables in this database is called 'SensorReadings' and it is used to store readings from 100 different sensors. Each reading contains information such as sensor ID, reading type (temperature, humidity, etc.), value, timestamp, sensor location, and status. When all 100 sensors start transmitting data simultaneously at a high frequency, the database might encounter performance issues. This is because it is not properly optimized to handle such a large number of concurrent write requests. 
Without proper data partitioning, buffering mechanisms, or indexing, the database's ability to process these write requests efficiently is limited. This can lead to increased write latency, I/O contention, and possible anomalies in the database. These anomalies can result in delayed processing, slower data transfer speeds, and overall inefficiency in managing the sensor data.\n" + }, + "246": { + "start_time": "1697332689", + "end_time": "1697332749", + "start_timestamp": "2023-10-15 09:18:09", + "end_timestamp": "2023-10-15 09:19:09", + "alerts": [], + "labels": [ + "POOR JOIN PERFORMANCE", + "CPU CONTENTION" + ], + "command": "python anomaly_trigger/main.py --anomaly POOR_JOIN_PERFORMANCE,CPU_CONTENTION", + "script": "import os\nimport re\nimport time\n\nimport psycopg2\n\n\nREPEATCOUNT = 1\nTIMELOGPATH = str(int(time.time())) + \"_job_trigger_time_log.txt\"\nTIMELOG = open(TIMELOGPATH, 'w+')\n\n\nclass Database():\n\n def __init__(self):\n self.conn = None\n self.conn = psycopg2.connect(database='imdbload',\n user='xxxx',\n password='xxxx',\n host='xxxx',\n port=5432)\n\n def execute_sql(self, sql):\n fail = 1\n cur = self.conn.cursor()\n i = 0\n cnt = 3\n while fail == 1 and i < cnt:\n try:\n fail = 0\n cur.execute(sql)\n except BaseException as error:\n fail = 1\n print(error)\n res = []\n if fail == 0:\n res = cur.fetchall()\n i = i + 1\n if fail == 1:\n # print(\"SQL Execution Fatal!!\", sql)\n return 0, ''\n elif fail == 0:\n return 1, res\n\n\ndef all_sql_files():\n res_path = \"{}/join-order-benchmark-master/\".format(\n os.path.dirname(os.path.abspath(__file__)))\n # all_file_list = list(filter(file_filter, os.listdir(res_path)))\n # all_file_list = sorted(all_file_list, key=custom_sort)\n all_file_list = [\n '1a.sql', '1b.sql', '1c.sql', '1d.sql',\n '2a.sql', '2b.sql', '2c.sql', '2d.sql',\n '3a.sql', '3b.sql', '3c.sql',\n '4a.sql', '4b.sql', '4c.sql',\n '5a.sql', '5b.sql', '5c.sql',\n '6a.sql', '6b.sql', '6c.sql', '6d.sql', '6e.sql', '6f.sql',\n '7a.sql', '7b.sql', '7c.sql',\n '8a.sql', '8b.sql', '8c.sql', '8d.sql',\n '9a.sql', '9b.sql', '9c.sql', '9d.sql',\n '10a.sql', '10b.sql', '10c.sql',\n '11a.sql', '11b.sql', '11c.sql', '11d.sql',\n '12a.sql', '12b.sql', '12c.sql',\n '13a.sql', '13b.sql', '13c.sql', '13d.sql',\n '14a.sql', '14b.sql', '14c.sql',\n '15a.sql', '15b.sql', '15c.sql', '15d.sql',\n '16a.sql', '16b.sql', '16c.sql', '16d.sql',\n '17a.sql', '17b.sql', '17c.sql', '17d.sql', '17e.sql', '17f.sql',\n '18a.sql', '18b.sql', '18c.sql',\n '19a.sql', '19b.sql', '19c.sql', '19d.sql',\n '20a.sql', '20b.sql', '20c.sql',\n '21a.sql', '21b.sql', '21c.sql',\n '22a.sql', '22b.sql', '22c.sql', '22d.sql',\n '23a.sql', '23b.sql', '23c.sql',\n '24a.sql', '24b.sql',\n '25a.sql', '25b.sql', '25c.sql',\n '26a.sql', '26b.sql', '26c.sql',\n '27a.sql', '27b.sql', '27c.sql',\n '28a.sql', '28b.sql', '28c.sql',\n '29a.sql', '29b.sql', '29c.sql',\n '30a.sql', '30b.sql', '30c.sql',\n '31a.sql', '31b.sql', '31c.sql',\n '32a.sql', '32b.sql',\n '33a.sql', '33b.sql', '33c.sql']\n\n print(all_file_list)\n files_list = []\n for file in all_file_list:\n files_list.append(res_path + file)\n return files_list\n\n\ndef custom_sort(item):\n # \u63d0\u53d6\u6570\u5b57\u548c\u5b57\u6bcd\u90e8\u5206\n match = re.match(r'(\\d+)(\\D+)', item)\n # \u5c06\u6570\u5b57\u90e8\u5206\u8f6c\u6362\u4e3a\u6574\u6570\u4ee5\u8fdb\u884c\u6bd4\u8f83\n num_part = int(match.group(1))\n # \u8fd4\u56de\u5143\u7ec4\u4ee5\u6309\u6570\u5b57\u548c\u5b57\u6bcd\u6392\u5e8f\n return (num_part, match.group(2))\n\n\ndef file_filter(f):\n 
if f[-4:] == '.sql' and 'schema' not in f and 'fkindexes' not in f:\n return True\n else:\n return False\n\n\ndef get_sql_from_file(file_name):\n file = open(file_name)\n lines = file.readlines().copy()\n sql = ''\n for line in lines:\n sql += line\n sql = sql.replace('\n', ' ').replace(' ', ' ').replace(' ', ' ')\n file.close()\n return sql\n\n\ndef test_hint_from_file(sql_file):\n db = Database()\n sql = get_sql_from_file(sql_file)\n success, result_cont = db.execute_sql(sql)\n print(success, result_cont)\n\n\ndef test_all():\n sql_files = all_sql_files()\n\n for sql_file in list(sql_files)[:-10]:\n if sql_file:\n test_hint_from_file(sql_file)\n\n\ndef test_one():\n res_path = \"{}/join-order-benchmark-master/\".format(\n os.path.dirname(os.path.abspath(__file__)))\n test_hint_from_file(res_path + '1a.sql')\n\n\nif __name__ == '__main__':\n for i in range(0, REPEATCOUNT):\n TIMELOG.write(str(int(time.time()))+\";\")\n test_all()\n TIMELOG.write(str(int(time.time()))+\"\n\")\n TIMELOG.flush()\n\n TIMELOG.close()\n\n)\n", + "description": "In a database system used for customer sales analysis, multiple users are attempting to perform join operations on large tables simultaneously. This process puts a heavy load on the CPU and leads to contention, resulting in poor performance of the join operations.\n", + "desc": "In a business intelligence scenario, there is a database named 'CorporateAnalyticsDB' that is used for storing and analyzing various business data of a large corporation. This database contains multiple complex tables such as 'SalesData', 'CustomerProfiles', 'ProductCatalog', and 'MarketTrends', each filled with a large number of rows and columns, recording detailed business information. In this scenario, the company's analysts frequently need to perform complex join queries across multiple tables to generate comprehensive business reports. However, due to the large size of these tables and the involvement of multi-table joins, the performance of these queries can become very slow. If the join keys in these tables are not effectively indexed or if the queries are not properly optimized, executing these join operations can consume a significant amount of time and resources. During peak periods, when multiple complex join queries are executed simultaneously, this can lead to competition for CPU resources, further reducing query efficiency. 
This CPU contention might occur due to too many compute-intensive queries running on the database server or the server's CPU resources being insufficient to handle these queries.\n" + }, + "247": { + "start_time": "1697332809", + "end_time": "1697332958", + "start_timestamp": "2023-10-15 09:20:09", + "end_timestamp": "2023-10-15 09:22:38", + "alerts": [], + "labels": [ + "FETCH_LARGE_DATA", + "CORRELATED SUBQUERY" + ], + "command": "python anomaly_trigger/main.py --anomaly FETCH_LARGE_DATA,CORRELATED_SUBQUERY", + "script": "import os\nimport re\nimport time\n\nimport psycopg2\n\n\nREPEATCOUNT = 1\nTIMELOGPATH = str(int(time.time())) + \"_tpch_trigger_time_log.txt\"\nTIMELOG = open(TIMELOGPATH, 'w+')\n\n\nclass Database():\n\n def __init__(self):\n self.conn = None\n self.conn = psycopg2.connect(database='tpch',\n user='xxxx',\n password='xxxx',\n host='xxxx',\n port=5432)\n\n def execute_sql(self, sql):\n fail = 1\n cur = self.conn.cursor()\n i = 0\n cnt = 3\n while fail == 1 and i < cnt:\n try:\n fail = 0\n cur.execute(sql)\n except BaseException as error:\n fail = 1\n print(error)\n res = []\n if fail == 0:\n res = cur.fetchall()\n i = i + 1\n if fail == 1:\n # print(\"SQL Execution Fatal!!\", sql)\n return 0, ''\n elif fail == 0:\n return 1, res\n\n\ndef all_sql_files():\n res_path = \"{}/tpch-queries/\".format(\n os.path.dirname(os.path.abspath(__file__)))\n # all_file_list = list(filter(file_filter, os.listdir(res_path)))\n # all_file_list = sorted(all_file_list, key=custom_sort)\n all_file_list = [\n '4.explain.sql']\n\n print(all_file_list)\n files_list = []\n for file in all_file_list:\n files_list.append(res_path + file)\n return files_list\n\n\ndef custom_sort(item):\n # \u63d0\u53d6\u6570\u5b57\u548c\u5b57\u6bcd\u90e8\u5206\n match = re.match(r'(\\d+)(\\D+)', item)\n # \u5c06\u6570\u5b57\u90e8\u5206\u8f6c\u6362\u4e3a\u6574\u6570\u4ee5\u8fdb\u884c\u6bd4\u8f83\n num_part = int(match.group(1))\n # \u8fd4\u56de\u5143\u7ec4\u4ee5\u6309\u6570\u5b57\u548c\u5b57\u6bcd\u6392\u5e8f\n return (num_part, match.group(2))\n\n\ndef file_filter(f):\n if f[-4:] == '.sql' and 'schema' not in f and 'fkindexes' not in f:\n return True\n else:\n return False\n\n\ndef get_sql_from_file(file_name):\n file = open(file_name)\n lines = file.readlines().copy()\n sql = ''\n for line in lines:\n sql += line\n sql = sql.replace('\n', ' ').replace(' ', ' ').replace(' ', ' ')\n file.close()\n return sql\n\n\ndef test_hint_from_file(sql_file):\n db = Database()\n sql = get_sql_from_file(sql_file)\n success, result_cont = db.execute_sql(sql)\n print(success, result_cont)\n\n\ndef test_all():\n sql_files = all_sql_files()\n\n for sql_file in list(sql_files):\n if sql_file:\n test_hint_from_file(sql_file)\n\n\ndef test_one():\n res_path = \"{}/tpch-queries/\".format(\n os.path.dirname(os.path.abspath(__file__)))\n test_hint_from_file(res_path + '1.explain.sql')\n\n\nif __name__ == '__main__':\n for i in range(0, REPEATCOUNT):\n TIMELOG.write(str(int(time.time()))+\";\")\n test_all()\n TIMELOG.write(str(int(time.time()))+\"\n\")\n TIMELOG.flush()\n\n TIMELOG.close()\n", + "description": "In an online retail system, when trying to retrieve a large amount of data, such as the inventory for each product, there is a risk of performance degradation if the query uses inefficient correlated subqueries. This can slow down the system's response time and impact overall user experience.\n", + "desc": "In an e-commerce scenario, there is a database called 'ECommerceDB' that stores information about a wide range of products. 
Within this database, there is a table named 'ProductInventory' that records the inventory details for each product, such as the stock level, last update time, supplier information, and warehouse location. In order to query the current inventory level of products in a specific category, related subqueries need to be performed. This involves selecting all products in the desired category from the 'ProductDetails' table and then retrieving the corresponding inventory information from the 'ProductInventory' table. However, when the number of products in the category is large, executing individual subqueries for each product can become time-consuming and inefficient. This can result in slower performance of the queries and potential I/O bottlenecks, as a significant amount of data needs to be read from the disk.\n" + }, + "248": { + "start_time": "1697333019", + "end_time": "1697333091", + "start_timestamp": "2023-10-15 09:23:39", + "end_timestamp": "2023-10-15 09:24:51", + "alerts": [], + "labels": [ + "highly concurrent commits or highly concurrent inserts" + ], + "command": "python anomaly_trigger/main.py --anomaly INSERT_LARGE_DATA", + "script": "import psycopg2\nimport sys\nsys.path.append('/root/DB-GPT/')\nimport time\nimport datetime\nimport random\nimport yaml\nfrom multiprocessing.pool import *\n\nclass DBArgs(object):\n\n def __init__(self, dbtype, config, dbname=None):\n self.dbtype = dbtype\n if self.dbtype == 'mysql':\n self.host = config['host']\n self.port = config['port']\n self.user = config['user']\n self.password = config['password']\n self.dbname = dbname if dbname else config['dbname']\n self.driver = 'com.mysql.jdbc.Driver'\n self.jdbc = 'jdbc:mysql://'\n else:\n self.host = config['host']\n self.port = config['port']\n self.user = config['user']\n self.password = config['password']\n self.dbname = dbname if dbname else config['dbname']\n self.driver = 'org.postgresql.Driver'\n self.jdbc = 'jdbc:postgresql://'\n\nclass Database():\n def __init__(self, args, timeout=-1):\n self.args = args\n self.conn = self.resetConn(timeout)\n\n\n # self.schema = self.compute_table_schema()\n\n def resetConn(self, timeout=-1):\n conn = psycopg2.connect(database=self.args.dbname,\n user=self.args.user,\n password=self.args.password,\n host=self.args.host,\n port=self.args.port)\n return conn\n\n def execute_sqls(self,sql):\n self.conn =self.resetConn(timeout=-1)\n cur = self.conn.cursor()\n cur.execute(sql)\n self.conn.commit()\n cur.close()\n self.conn.close()\n\n def execute_sql_duration(self, duration, sql, max_id=0, commit_interval=500):\n self.conn = self.resetConn(timeout=-1)\n cursor = self.conn.cursor()\n start = time.time()\n cnt = 0\n if duration > 0:\n while (time.time() - start) < duration:\n if max_id > 0:\n id = random.randint(1, max_id - 1)\n cursor.execute(sql + str(id) + ';')\n else:\n cursor.execute(sql)\n cnt += 1\n if cnt % commit_interval == 0:\n self.conn.commit()\n else:\n print(\"error, the duration should be larger than 0\")\n self.conn.commit()\n cursor.close()\n self.conn.close()\n return cnt\n\n def concurrent_execute_sql(self, threads, duration, sql, max_id=0, commit_interval=500):\n pool = ThreadPool(threads)\n results = [pool.apply_async(self.execute_sql_duration, (duration, sql, max_id, commit_interval)) for _ in range(threads)]\n pool.close()\n pool.join()\n return results\n \ndef init():\n #add the config\n config_path = \"/root/DB-GPT/config/tool_config.yaml\"\n with open(config_path, 'r') as config_file:\n config = yaml.safe_load(config_file) \n db_args 
=DBArgs('pgsql', config)\n return db_args\n\n#create a table\ndef create_table(table_name,colsize, ncolumns):\n db=Database(init())\n column_definitions = ', '.join(f'name{i} varchar({colsize})' for i in range(ncolumns))\n creat_sql = f'CREATE TABLE {table_name} (id int, {column_definitions}, time timestamp);'\n db.execute_sqls(creat_sql)\n\n#delete the table\ndef delete_table(table_name):\n db=Database(init())\n delete_sql=f'DROP TABLE if exists {table_name}'\n db.execute_sqls(delete_sql)\n\n#print the current time\ndef print_time():\n current_time = datetime.datetime.now()\n formatted_time = current_time.strftime(\"%Y-%m-%d %H:%M:%S\")\n print(formatted_time)\n\n\ndef insert_large_data(threads,duration,ncolumns,nrows,colsize,table_name='table1'):\n\n print_time()\n #Delete undeleted tables\n delete_table(table_name)\n #create a new table\n create_table(table_name,colsize, ncolumns)\n db=Database(init())\n #insert the data\n #insert_definitions = ', '.join(f'repeat(round(random()*999)::text,{(colsize//3)})' for i in range(ncolumns))\n insert_definitions = ', '.join(f'(SELECT substr(md5(random()::text), 1, {colsize}))' for i in range(ncolumns))\n insert_data=f'insert into {table_name} select generate_series(1,{nrows}),{insert_definitions}, now();'\n db.concurrent_execute_sql(threads,duration,insert_data,commit_interval=1)\n\n #delete the table\n delete_table(table_name)\n \n #print the end time\n print_time()\nif __name__ == \"__main__\":\n # Number of threads to use for concurrent inserts\n num_threads = 166\n \n # Duration for which to run the inserts (in seconds)\n insert_duration = None\n \n # Number of columns in the table\n num_columns = 5\n \n # Number of rows to insert\n num_rows = 75\n \n # Size of each column (in characters)\n column_size = 27\n \n # Table name\n table_name = 'table1'\n \n # Call the insert_large_data function\n insert_large_data(num_threads, insert_duration, num_columns, num_rows, column_size, table_name)\n", + "description": "In a smart home automation system, 166 sensors are sending a large amount of data simultaneously, which needs to be inserted into the database. Each data point has 5 attributes with a size of 27 characters, and there are a total of 75 data points. This process simulates a database exception due to the high volume of data being inserted.\n", + "desc": "In an Internet of Things (IoT) scenario, imagine a database named 'IoTDataDB' that is used for collecting and storing data from various sensors. The database is designed to handle a large volume of sensor data, including information on temperature, humidity, pressure, light, and motion. One of the key tables in the database is called 'SensorReadings', which records the readings from 75 sensors. Each reading includes data from 5 columns, such as sensor ID, reading type, reading value, timestamp, and sensor location. During operation, these sensors continuously transmit data to the database, leading to a high volume of concurrent write requests. However, the database might encounter performance issues due to factors such as insufficient buffering, lacking indexing, or inefficient data partitioning. These factors can limit the database's ability to handle these write requests, leading to increased write latency and potential anomalies. These anomalies can manifest as delayed write operations, database locking, or even failure to process incoming sensor data. 
These issues not only impact the efficient storage and processing of sensor data but also affect the overall performance and reliability of the IoT system.\n" + }, + "249": { + "start_time": "1697333151", + "end_time": "1697333223", + "start_timestamp": "2023-10-15 09:25:51", + "end_timestamp": "2023-10-15 09:27:03", + "alerts": [], + "labels": [ + "highly concurrent commits or highly concurrent inserts" + ], + "command": "python anomaly_trigger/main.py --anomaly INSERT_LARGE_DATA", + "script": "import psycopg2\nimport sys\nsys.path.append('/root/DB-GPT/')\nimport time\nimport datetime\nimport random\nimport yaml\nfrom multiprocessing.pool import *\n\nclass DBArgs(object):\n\n def __init__(self, dbtype, config, dbname=None):\n self.dbtype = dbtype\n if self.dbtype == 'mysql':\n self.host = config['host']\n self.port = config['port']\n self.user = config['user']\n self.password = config['password']\n self.dbname = dbname if dbname else config['dbname']\n self.driver = 'com.mysql.jdbc.Driver'\n self.jdbc = 'jdbc:mysql://'\n else:\n self.host = config['host']\n self.port = config['port']\n self.user = config['user']\n self.password = config['password']\n self.dbname = dbname if dbname else config['dbname']\n self.driver = 'org.postgresql.Driver'\n self.jdbc = 'jdbc:postgresql://'\n\nclass Database():\n def __init__(self, args, timeout=-1):\n self.args = args\n self.conn = self.resetConn(timeout)\n\n\n # self.schema = self.compute_table_schema()\n\n def resetConn(self, timeout=-1):\n conn = psycopg2.connect(database=self.args.dbname,\n user=self.args.user,\n password=self.args.password,\n host=self.args.host,\n port=self.args.port)\n return conn\n\n def execute_sqls(self,sql):\n self.conn =self.resetConn(timeout=-1)\n cur = self.conn.cursor()\n cur.execute(sql)\n self.conn.commit()\n cur.close()\n self.conn.close()\n\n def execute_sql_duration(self, duration, sql, max_id=0, commit_interval=500):\n self.conn = self.resetConn(timeout=-1)\n cursor = self.conn.cursor()\n start = time.time()\n cnt = 0\n if duration > 0:\n while (time.time() - start) < duration:\n if max_id > 0:\n id = random.randint(1, max_id - 1)\n cursor.execute(sql + str(id) + ';')\n else:\n cursor.execute(sql)\n cnt += 1\n if cnt % commit_interval == 0:\n self.conn.commit()\n else:\n print(\"error, the duration should be larger than 0\")\n self.conn.commit()\n cursor.close()\n self.conn.close()\n return cnt\n\n def concurrent_execute_sql(self, threads, duration, sql, max_id=0, commit_interval=500):\n pool = ThreadPool(threads)\n results = [pool.apply_async(self.execute_sql_duration, (duration, sql, max_id, commit_interval)) for _ in range(threads)]\n pool.close()\n pool.join()\n return results\n \ndef init():\n #add the config\n config_path = \"/root/DB-GPT/config/tool_config.yaml\"\n with open(config_path, 'r') as config_file:\n config = yaml.safe_load(config_file) \n db_args =DBArgs('pgsql', config)\n return db_args\n\n#create a table\ndef create_table(table_name,colsize, ncolumns):\n db=Database(init())\n column_definitions = ', '.join(f'name{i} varchar({colsize})' for i in range(ncolumns))\n creat_sql = f'CREATE TABLE {table_name} (id int, {column_definitions}, time timestamp);'\n db.execute_sqls(creat_sql)\n\n#delete the table\ndef delete_table(table_name):\n db=Database(init())\n delete_sql=f'DROP TABLE if exists {table_name}'\n db.execute_sqls(delete_sql)\n\n#print the current time\ndef print_time():\n current_time = datetime.datetime.now()\n formatted_time = current_time.strftime(\"%Y-%m-%d %H:%M:%S\")\n 
print(formatted_time)\n\n\ndef insert_large_data(threads,duration,ncolumns,nrows,colsize,table_name='table1'):\n\n print_time()\n #Delete undeleted tables\n delete_table(table_name)\n #create a new table\n create_table(table_name,colsize, ncolumns)\n db=Database(init())\n #insert the data\n #insert_definitions = ', '.join(f'repeat(round(random()*999)::text,{(colsize//3)})' for i in range(ncolumns))\n insert_definitions = ', '.join(f'(SELECT substr(md5(random()::text), 1, {colsize}))' for i in range(ncolumns))\n insert_data=f'insert into {table_name} select generate_series(1,{nrows}),{insert_definitions}, now();'\n db.concurrent_execute_sql(threads,duration,insert_data,commit_interval=1)\n\n #delete the table\n delete_table(table_name)\n \n #print the end time\n print_time()\nif __name__ == \"__main__\":\n # Number of threads to use for concurrent inserts\n num_threads = 166\n \n # Duration for which to run the inserts (in seconds)\n insert_duration = None\n \n # Number of columns in the table\n num_columns = 32\n \n # Number of rows to insert\n num_rows = 75\n \n # Size of each column (in characters)\n column_size = 54\n \n # Table name\n table_name = 'table1'\n \n # Call the insert_large_data function\n insert_large_data(num_threads, insert_duration, num_columns, num_rows, column_size, table_name)\n", + "description": "In a data-intensive application, a large amount of data generated by 166 sensors needs to be inserted into the database simultaneously. This can simulate the database exception caused by the high volume and frequency of data insertion. The database table contains 32 columns and 75 rows, with each column having a size of 54 characters.\n", + "desc": "In an e-commerce application, suppose there is a database called 'ECommerceDB' used for storing various types of product information. Within this database, there is a table named 'ProductDetails' that contains 75 rows of data, each representing a product, with a total of 32 columns. These columns hold information such as product ID, name, price, stock quantity, brand, category, size, color, weight, user rating, number of reviews, production date, expiration date, supplier information, country, shipping method, discount information, image link, description, and product status. The size of each column can accommodate up to 54 characters.At a given time, 166 users simultaneously submit requests to insert new product data into this 'ProductDetails' table. However, due to factors like inefficient indexing, limited memory resources, or lack of optimization in the database, handling such a large number of concurrent insertions can cause performance issues. This could result in extended insert latency, high CPU usage, or even database locking. 
These anomalies can impact the overall efficiency and user experience of the e-commerce platform.\n" + }, + "250": { + "start_time": "1697333283", + "end_time": "1697333343", + "start_timestamp": "2023-10-15 09:28:03", + "end_timestamp": "2023-10-15 09:29:03", + "alerts": [], + "labels": [ + "highly concurrent updates" + ], + "command": "python anomaly_trigger/main.py --anomaly LOCK_CONTENTION", + "script": "import psycopg2\nimport sys\nsys.path.append('/root/DB-GPT/')\nimport time\nimport datetime\nimport random\nimport yaml\nfrom multiprocessing.pool import *\n\n\nclass DBArgs(object):\n\n def __init__(self, dbtype, config, dbname=None):\n self.dbtype = dbtype\n if self.dbtype == 'mysql':\n self.host = config['host']\n self.port = config['port']\n self.user = config['user']\n self.password = config['password']\n self.dbname = dbname if dbname else config['dbname']\n self.driver = 'com.mysql.jdbc.Driver'\n self.jdbc = 'jdbc:mysql://'\n else:\n self.host = config['host']\n self.port = config['port']\n self.user = config['user']\n self.password = config['password']\n self.dbname = dbname if dbname else config['dbname']\n self.driver = 'org.postgresql.Driver'\n self.jdbc = 'jdbc:postgresql://'\n\nclass Database():\n def __init__(self, args, timeout=-1):\n self.args = args\n self.conn = self.resetConn(timeout)\n\n\n # self.schema = self.compute_table_schema()\n\n def resetConn(self, timeout=-1):\n conn = psycopg2.connect(database=self.args.dbname,\n user=self.args.user,\n password=self.args.password,\n host=self.args.host,\n port=self.args.port)\n return conn\n \n def execute_sqls(self,sql):\n self.conn =self.resetConn(timeout=-1)\n cur = self.conn.cursor()\n cur.execute(sql)\n self.conn.commit()\n cur.close()\n self.conn.close()\n\n def execute_sql_duration(self, duration, sql, max_id=0, commit_interval=500):\n self.conn = self.resetConn(timeout=-1)\n cursor = self.conn.cursor()\n start = time.time()\n cnt = 0\n if duration > 0:\n while (time.time() - start) < duration:\n if max_id > 0:\n id = random.randint(1, max_id - 1)\n cursor.execute(sql + str(id) + ';')\n else:\n cursor.execute(sql)\n cnt += 1\n if cnt % commit_interval == 0:\n self.conn.commit()\n else:\n print(\"error, the duration should be larger than 0\")\n self.conn.commit()\n cursor.close()\n self.conn.close()\n return cnt\n\n def concurrent_execute_sql(self, threads, duration, sql, max_id=0, commit_interval=500):\n pool = ThreadPool(threads)\n results = [pool.apply_async(self.execute_sql_duration, (duration, sql, max_id, commit_interval)) for _ in range(threads)]\n pool.close()\n pool.join()\n return results\n\ndef init():\n #add the config\n config_path = \"/root/DB-GPT/config/tool_config.yaml\"\n with open(config_path, 'r') as config_file:\n config = yaml.safe_load(config_file) \n db_args =DBArgs('pgsql', config)\n return db_args\n\n\n#create a table\ndef create_table(table_name,colsize, ncolumns):\n db=Database(init())\n column_definitions = ', '.join(f'name{i} varchar({colsize})' for i in range(ncolumns))\n creat_sql = f'CREATE TABLE {table_name} (id int, {column_definitions}, time timestamp);'\n db.execute_sqls(creat_sql)\n\n#delete the table\ndef delete_table(table_name):\n db=Database(init())\n delete_sql=f'DROP TABLE if exists {table_name}'\n db.execute_sqls(delete_sql)\n\n#print the current time\ndef print_time():\n current_time = datetime.datetime.now()\n formatted_time = current_time.strftime(\"%Y-%m-%d %H:%M:%S\")\n print(formatted_time)\n\n\ndef 
lock_contention(threads,duration,ncolumns,nrows,colsize,table_name='table1'):\n #create a new table\n print_time()\n delete_table(table_name)\n create_table(table_name,colsize, ncolumns)\n db=Database(init())\n # insert some data to be updated \n insert_definitions = ', '.join(f'(SELECT substr(md5(random()::text), 1, {colsize}))' for i in range(ncolumns))\n insert_data=f'insert into {table_name} select generate_series(1,{nrows}),{insert_definitions}, now();' \n db.execute_sqls(insert_data) \n pool = Pool(threads)\n for _ in range(threads):\n pool.apply_async(\n lock, (table_name, ncolumns, colsize, duration, nrows))\n pool.close()\n pool.join()\n #delete the table\n delete_table(table_name)\n print_time()\n\ndef lock(table_name, ncolumns, colsize, duration, nrows):\n args=init()\n start = time.time()\n #lock_contention\n while time.time()-start < duration:\n conn = psycopg2.connect(database=args.dbname, user=args.user, password=args.password,\n host=args.host, port=args.port)\n cur = conn.cursor()\n while time.time()-start < duration:\n col_name = random.randint(0, ncolumns-1)\n row_name = random.randint(1, nrows-1)\n lock_contention = f'update {table_name} set name{col_name}=(SELECT substr(md5(random()::text), 1, {colsize})) where id ={row_name}'\n #db.concurrent_execute_sql(threads,duration,lock_contention,nrows)\n cur.execute(lock_contention)\n conn.commit()\n conn.commit()\n conn.close()\n\nif __name__ == \"__main__\":\n # Number of threads to use for concurrent inserts\n num_threads = 61\n \n # Duration for which to run the inserts (in seconds)\n insert_duration = None\n \n # Number of columns in the table\n num_columns = 87\n \n # Number of rows to insert\n num_rows = 284\n \n # Size of each column (in characters)\n column_size = 85\n \n # Table name\n table_name = 'table1'\n \n # Call the insert_large_data function\n lock_contention(num_threads, insert_duration, num_columns, num_rows, column_size, table_name)\n", + "description": "In a database system used by an online marketplace, 61 users are simultaneously attempting to perform frequent update operations on a database table that contains 87 columns and 284 rows of product records. Each column has a size of 85 characters. These users are competing with each other to lock the database table in order to complete their update operations. As a result, the database system may encounter contention issues, resulting in slower performance and potential exceptions.\n", + "desc": "In an Internet of Things (IoT) scenario, there is a database used for storing and analyzing sensor data, named 'SensorDataDB'. This database is designed to handle data from various types of sensors. One of the key tables in this database is 'SensorReadings', which records detailed information about sensor readings. This table consists of 284 rows of data, each representing a reading from a sensor, with a total of 87 columns, each containing information of up to 85 characters. These columns could include sensor ID, reading type, reading value, timestamp, sensor location, and status information. In this scenario, 61 sensors are simultaneously sending data to the database. Due to the design of the table and the locking mechanism of the database, concurrent write operations from multiple sensors might result in a contention for locking the table. This contention can cause performance issues, such as delays in processing write requests or even database locking. 
These performance issues could impact the real-time processing of sensor data and affect the overall functionality of the IoT system.\n" + }, + "251": { + "start_time": "1697333403", + "end_time": "1697333507", + "start_timestamp": "2023-10-15 09:30:03", + "end_timestamp": "2023-10-15 09:31:47", + "alerts": [], + "labels": [ + "highly deletes" + ], + "command": "python anomaly_trigger/main.py --anomaly VACUUM", + "script": "import psycopg2\nimport sys\nsys.path.append('/root/DB-GPT/')\nimport time\nimport datetime\nimport random\nimport yaml\nfrom multiprocessing.pool import *\n\n\nclass DBArgs(object):\n\n def __init__(self, dbtype, config, dbname=None):\n self.dbtype = dbtype\n if self.dbtype == 'mysql':\n self.host = config['host']\n self.port = config['port']\n self.user = config['user']\n self.password = config['password']\n self.dbname = dbname if dbname else config['dbname']\n self.driver = 'com.mysql.jdbc.Driver'\n self.jdbc = 'jdbc:mysql://'\n else:\n self.host = config['host']\n self.port = config['port']\n self.user = config['user']\n self.password = config['password']\n self.dbname = dbname if dbname else config['dbname']\n self.driver = 'org.postgresql.Driver'\n self.jdbc = 'jdbc:postgresql://'\n\nclass Database():\n def __init__(self, args, timeout=-1):\n self.args = args\n self.conn = self.resetConn(timeout)\n\n\n # self.schema = self.compute_table_schema()\n\n def resetConn(self, timeout=-1):\n conn = psycopg2.connect(database=self.args.dbname,\n user=self.args.user,\n password=self.args.password,\n host=self.args.host,\n port=self.args.port)\n return conn\n \n def execute_sqls(self,sql):\n self.conn =self.resetConn(timeout=-1)\n cur = self.conn.cursor()\n cur.execute(sql)\n self.conn.commit()\n cur.close()\n self.conn.close()\n\n def execute_sql_duration(self, duration, sql, max_id=0, commit_interval=500):\n self.conn = self.resetConn(timeout=-1)\n cursor = self.conn.cursor()\n start = time.time()\n cnt = 0\n if duration > 0:\n while (time.time() - start) < duration:\n if max_id > 0:\n id = random.randint(1, max_id - 1)\n cursor.execute(sql + str(id) + ';')\n else:\n cursor.execute(sql)\n cnt += 1\n if cnt % commit_interval == 0:\n self.conn.commit()\n else:\n print(\"error, the duration should be larger than 0\")\n self.conn.commit()\n cursor.close()\n self.conn.close()\n return cnt\n\n def concurrent_execute_sql(self, threads, duration, sql, max_id=0, commit_interval=500):\n pool = ThreadPool(threads)\n results = [pool.apply_async(self.execute_sql_duration, (duration, sql, max_id, commit_interval)) for _ in range(threads)]\n pool.close()\n pool.join()\n return results\n \ndef init():\n #add the config\n config_path = \"/root/DB-GPT/config/tool_config.yaml\"\n with open(config_path, 'r') as config_file:\n config = yaml.safe_load(config_file) \n db_args =DBArgs('pgsql', config)\n return db_args\n\n\n#create a table\ndef create_table(table_name,colsize, ncolumns):\n db=Database(init())\n column_definitions = ', '.join(f'name{i} varchar({colsize})' for i in range(ncolumns))\n creat_sql = f'CREATE TABLE {table_name} (id int, {column_definitions}, time timestamp);'\n db.execute_sqls(creat_sql)\n\n#delete the table\ndef delete_table(table_name):\n db=Database(init())\n delete_sql=f'DROP TABLE if exists {table_name}'\n db.execute_sqls(delete_sql)\n\n#print the current time\ndef print_time():\n current_time = datetime.datetime.now()\n formatted_time = current_time.strftime(\"%Y-%m-%d %H:%M:%S\")\n print(formatted_time)\n\n\ndef 
vacuum(threads,duration,ncolumns,nrows,colsize,table_name='table1'):\n db=Database(init())\n #create a new table\n print_time()\n delete_table(table_name)\n create_table(table_name,colsize, ncolumns)\n\n # insert some data to be deleted\n insert_definitions = ', '.join(f'(SELECT substr(md5(random()::text), 1, {colsize}))' for i in range(ncolumns))\n insert_data=f'insert into {table_name} select generate_series(1,{nrows}),{insert_definitions}, now();' \n db.execute_sqls(insert_data) \n\n # delete 80% of the rows\n delete_nrows=int(nrows*0.8)\n vacuum=f'delete from {table_name} where id < {delete_nrows};'\n db.execute_sqls(vacuum)\n\n # do the select , then the vacuum occurs\n select='select * from '+table_name+' where id='\n db.concurrent_execute_sql(threads,duration,select,nrows)\n\n #delete the table\n delete_table(table_name)\n print_time()\n\n\nif __name__ == \"__main__\":\n # Number of threads to use for concurrent inserts\n num_threads = 137\n \n # Duration for which to run the inserts (in seconds)\n insert_duration = None\n \n # Number of columns in the table\n num_columns = 5\n \n # Number of rows to insert\n num_rows = 3959949\n \n # Size of each column (in characters)\n column_size = 54\n \n # Table name\n table_name = 'table1'\n \n # Call the insert_large_data function\n vacuum(num_threads, insert_duration, num_columns, num_rows, column_size, table_name)\n", + "description": "In the database of an online marketplace, if 137 users simultaneously perform a search after a large-scale data cleaning operation on a database table containing 5 columns, 3,959,949 rows, with each column having a size of 54 characters for product records, an exception will occur in the database.\n", + "desc": "In an e-commerce scenario, there is a database named 'StoreDB', which stores information about products for an online store. One of the key tables in this database is called 'ProductDetails', which contains detailed information about each product. This table consists of 3,959,949 rows, with each row representing a product and containing 5 columns. These columns may include product ID, name, price, stock quantity, and description, with each column having a size of 54 characters. On a regular basis, the database administrator needs to optimize the database by cleaning up unused or outdated data. The 'VACUUM' command is used to perform this cleanup operation. However, in this particular case, the cleanup process needs to handle a large number of rows and columns, and it is necessary to execute the command in parallel using 137 threads. If the database administrator does not take appropriate measures to optimize the cleanup process, such as implementing proper data partitioning, batch processing, or scheduling the cleanup during low traffic periods, it could potentially lead to performance issues in the database. These performance issues might include increased response time, CPU and memory contention, or even database lockups, ultimately resulting in anomalies in the system. 
It is crucial for the administrator to carefully plan and execute the cleanup process to avoid any negative impact on the overall performance and stability of the online store.\n" + }, + "252": { + "start_time": "1697333567", + "end_time": "1697333681", + "start_timestamp": "2023-10-15 09:32:47", + "end_timestamp": "2023-10-15 09:34:41", + "alerts": [], + "labels": [ + "too many indexes" + ], + "command": "python anomaly_trigger/main.py --anomaly REDUNDANT_INDEX", + "script": "import psycopg2\nimport sys\nsys.path.append('/root/DB-GPT/')\nimport time\nimport datetime\nimport random\nimport yaml\nfrom multiprocessing.pool import *\n\n\nclass DBArgs(object):\n\n def __init__(self, dbtype, config, dbname=None):\n self.dbtype = dbtype\n if self.dbtype == 'mysql':\n self.host = config['host']\n self.port = config['port']\n self.user = config['user']\n self.password = config['password']\n self.dbname = dbname if dbname else config['dbname']\n self.driver = 'com.mysql.jdbc.Driver'\n self.jdbc = 'jdbc:mysql://'\n else:\n self.host = config['host']\n self.port = config['port']\n self.user = config['user']\n self.password = config['password']\n self.dbname = dbname if dbname else config['dbname']\n self.driver = 'org.postgresql.Driver'\n self.jdbc = 'jdbc:postgresql://'\n\nclass Database():\n def __init__(self, args, timeout=-1):\n self.args = args\n self.conn = self.resetConn(timeout)\n\n\n # self.schema = self.compute_table_schema()\n\n def resetConn(self, timeout=-1):\n conn = psycopg2.connect(database=self.args.dbname,\n user=self.args.user,\n password=self.args.password,\n host=self.args.host,\n port=self.args.port)\n return conn\n \n def execute_sqls(self,sql):\n self.conn =self.resetConn(timeout=-1)\n cur = self.conn.cursor()\n cur.execute(sql)\n self.conn.commit()\n cur.close()\n self.conn.close()\n\n def execute_sql_duration(self, duration, sql, max_id=0, commit_interval=500):\n self.conn = self.resetConn(timeout=-1)\n cursor = self.conn.cursor()\n start = time.time()\n cnt = 0\n if duration > 0:\n while (time.time() - start) < duration:\n if max_id > 0:\n id = random.randint(1, max_id - 1)\n cursor.execute(sql + str(id) + ';')\n else:\n cursor.execute(sql)\n cnt += 1\n if cnt % commit_interval == 0:\n self.conn.commit()\n else:\n print(\"error, the duration should be larger than 0\")\n self.conn.commit()\n cursor.close()\n self.conn.close()\n return cnt\n\n def concurrent_execute_sql(self, threads, duration, sql, max_id=0, commit_interval=500):\n pool = ThreadPool(threads)\n results = [pool.apply_async(self.execute_sql_duration, (duration, sql, max_id, commit_interval)) for _ in range(threads)]\n pool.close()\n pool.join()\n return results\n \n def build_index(self, table_name, idx_num):\n self.conn = self.resetConn(timeout=-1)\n cursor = self.conn.cursor()\n \n for i in range(0, idx_num):\n the_sql = 'CREATE INDEX index_' + table_name + '_' + str(i) + ' ON ' + table_name + '(name' + str(i) + ');'\n print(the_sql)\n cursor.execute(the_sql)\n\n \n self.conn.commit()\n self.conn.close()\n return\n\n\n \n def drop_index(self,table_name):\n self.conn = self.resetConn(timeout=-1)\n cursor = self.conn.cursor()\n cursor.execute(\"select indexname from pg_indexes where tablename='\"+table_name+\"';\")\n idxs = cursor.fetchall()\n for idx in idxs:\n the_sql = 'DROP INDEX ' + idx[0] + ';'\n cursor.execute(the_sql)\n print(the_sql)\n self.conn.commit()\n self.conn.close()\n return\n\n\ndef init():\n #add the config\n config_path = \"/root/DB-GPT/config/tool_config.yaml\"\n with open(config_path, 'r') 
as config_file:\n config = yaml.safe_load(config_file) \n db_args =DBArgs('pgsql', config)\n return db_args\n\n\n#create a table\ndef create_table(table_name,colsize, ncolumns):\n db=Database(init())\n column_definitions = ', '.join(f'name{i} varchar({colsize})' for i in range(ncolumns))\n creat_sql = f'CREATE TABLE {table_name} (id int, {column_definitions}, time timestamp);'\n db.execute_sqls(creat_sql)\n\n#delete the table\ndef delete_table(table_name):\n db=Database(init())\n delete_sql=f'DROP TABLE if exists {table_name}'\n db.execute_sqls(delete_sql)\n\n#print the current time\ndef print_time():\n current_time = datetime.datetime.now()\n formatted_time = current_time.strftime(\"%Y-%m-%d %H:%M:%S\")\n print(formatted_time)\n\ndef redundent_index(threads,duration,ncolumns,nrows,colsize,nindex,table_name='table1'):\n #create a new table\n print_time()\n delete_table(table_name)\n create_table(table_name,colsize, ncolumns)\n db=Database(init())\n # insert some data to be updated \n insert_definitions = ', '.join(f'(SELECT substr(md5(random()::text), 1, {colsize}))' for i in range(ncolumns))\n insert_data=f'insert into {table_name} select generate_series(1,{nrows}),{insert_definitions}, now();' \n db.execute_sqls(insert_data) \n\n #initialization of the indexes\n nindex=int((nindex*ncolumns)/10)\n db.build_index(table_name,nindex)\n id_index='CREATE INDEX index_'+table_name+'_id ON '+table_name+'(id);'\n db.execute_sqls(id_index)\n\n #lock_contention\n pool = Pool(threads)\n for _ in range(threads):\n pool.apply_async(\n lock, (table_name, ncolumns, colsize, duration, nrows))\n pool.close()\n pool.join()\n\n #drop the index\n db.drop_index(table_name)\n\n #delete the table\n delete_table(table_name)\n print_time()\n\n\ndef lock(table_name, ncolumns, colsize, duration, nrows):\n args=init()\n start = time.time()\n #lock_contention\n while time.time()-start < duration:\n conn = psycopg2.connect(database=args.dbname, user=args.user, password=args.password,\n host=args.host, port=args.port)\n cur = conn.cursor()\n while time.time()-start < duration:\n col_name = random.randint(0, ncolumns-1)\n row_name = random.randint(1, nrows-1)\n lock_contention = f'update {table_name} set name{col_name}=(SELECT substr(md5(random()::text), 1, {colsize})) where id ={row_name}'\n #db.concurrent_execute_sql(threads,duration,lock_contention,nrows)\n cur.execute(lock_contention)\n conn.commit()\n conn.commit()\n conn.close()\n\nif __name__ == \"__main__\":\n # Number of threads to use for concurrent inserts\n num_threads = 6\n \n # Duration for which to run the inserts (in seconds)\n insert_duration = None\n \n # Number of columns in the table\n num_columns = 53\n \n # Number of rows to insert\n num_rows = 544087\n \n # Size of each column (in characters)\n column_size = 61\n \n # Table name\n table_name = 'table1'\n \n nindex=6\n \n # Call the insert_large_data function\n redundent_index(num_threads, insert_duration, num_columns, num_rows, column_size, nindex,table_name)\n", + "description": "In a financial institution's database with 53 columns and 544,087 rows, each with a column size of 61 characters, a large number of redundant indexes are created for different financial transaction attributes such as date, transaction type, and amount. 
These redundant indexes lead to additional storage consumption and performance overhead.\n", + "desc": "In a business intelligence scenario, specifically involving the financial records of a large corporation, a database named 'CorporateFinanceDB' is used to store and analyze financial data. This database contains multiple tables, including a key table named 'FinancialRecords', which records detailed information about various financial transactions and statements. The 'FinancialRecords' table consists of 544,087 rows, each representing a financial record, with a total of 53 columns, each containing information up to 61 characters. These columns may include transaction ID, transaction type (such as income, expenditure, assets, liabilities), amount, date, department, project code, financial year, audit status, and more. In a business intelligence analysis process, to enhance the efficiency of complex financial queries, the database administrator might create redundant indexes before performing these queries. These indexes could be based on transaction type, date range, department, or project code. However, excessive use of redundant indexes can result in additional storage usage and performance overhead in the database. Moreover, the frequent creation and deletion of indexes might cause database fragmentation, leading to delayed report generation and affecting the decision-making process.\n" + }, + "253": { + "start_time": "1697333741", + "end_time": "1697333832", + "start_timestamp": "2023-10-15 09:35:41", + "end_timestamp": "2023-10-15 09:37:12", + "alerts": [], + "labels": [ + "INSERT_LARGE_DATA", + "IO_CONTENTION" + ], + "command": "python anomaly_trigger/main.py --anomaly INSERT_LARGE_DATA,IO_CONTENTION", + "script": "import os\nimport datetime\n\n#print the current time\ndef print_time():\n current_time = datetime.datetime.now()\n formatted_time = current_time.strftime(\"%Y-%m-%d %H:%M:%S\")\n print(formatted_time)\n\nif __name__ == \"__main__\":\n print_time()\n command = (\n \"su - root -c 'cd /sysbench-tpcc-master; \"\n \"./tpcc.lua --db-driver=pgsql --tables=2 --scale=3 --threads=50 --events=0 \"\n \"--pgsql-host=xxxx --pgsql-user=xxxx --pgsql-password=xxxx \"\n \"--pgsql-port=5432 --pgsql-db=tpcc --time=90 --rand-type=uniform --report-interval=10 run'\"\n )\n\n os.system(command)\n print_time()\n", + "description": "In a cloud storage system, multiple users are uploading, downloading, or editing files at the same time. Due to high I/O competition, the file transfer speed is significantly reduced.\n", + "desc": "In an IoT (Internet of Things) scenario, there is a database named 'IoTDataDB' used to store and analyze large volumes of data from various IoT devices. This database contains a key table named 'SensorData', which records data from different sensors deployed in IoT networks. The table consists of a large number of rows, each representing a data reading from a sensor, and multiple columns representing different attributes of the sensor data, such as sensor ID, data type, value, timestamp, location, and device ID. In this particular scenario, there is high concurrent data ingestion into the database, where multiple IoT devices are simultaneously sending data to the database for storage and analysis. These devices might be transmitting data at a very high frequency, and the database needs to efficiently handle this influx of data. 
However, due to limited storage resources or inefficient I/O (input/output) operations, the database faces challenges related to I/O contention.When multiple IoT devices are simultaneously uploading data to the database, contention for storage space and network bandwidth occurs. This contention can slow down the process of storing and indexing the incoming data, resulting in poorer performance and increased latency in data ingestion. Furthermore, frequent write operations in the database (such as data inserts and updates) can impact the overall I/O performance and the system's ability to handle the high concurrency of data ingestion.To mitigate these issues, various approaches can be taken, such as optimizing storage mechanisms, implementing data buffering techniques, or using distributed data processing frameworks. These measures aim to improve the I/O performance of the database, ensuring efficient ingestion and processing of the large volume of data generated by IoT devices.\n" + }, + "254": { + "start_time": "1697333892", + "end_time": "1697333952", + "start_timestamp": "2023-10-15 09:38:12", + "end_timestamp": "2023-10-15 09:39:12", + "alerts": [], + "labels": [ + "POOR JOIN PERFORMANCE", + "CPU CONTENTION" + ], + "command": "python anomaly_trigger/main.py --anomaly POOR_JOIN_PERFORMANCE,CPU_CONTENTION", + "script": "import os\nimport re\nimport time\n\nimport psycopg2\n\n\nREPEATCOUNT = 1\nTIMELOGPATH = str(int(time.time())) + \"_job_trigger_time_log.txt\"\nTIMELOG = open(TIMELOGPATH, 'w+')\n\n\nclass Database():\n\n def __init__(self):\n self.conn = None\n self.conn = psycopg2.connect(database='imdbload',\n user='xxxx',\n password='xxxx',\n host='xxxx',\n port=5432)\n\n def execute_sql(self, sql):\n fail = 1\n cur = self.conn.cursor()\n i = 0\n cnt = 3\n while fail == 1 and i < cnt:\n try:\n fail = 0\n cur.execute(sql)\n except BaseException as error:\n fail = 1\n print(error)\n res = []\n if fail == 0:\n res = cur.fetchall()\n i = i + 1\n if fail == 1:\n # print(\"SQL Execution Fatal!!\", sql)\n return 0, ''\n elif fail == 0:\n return 1, res\n\n\ndef all_sql_files():\n res_path = \"{}/join-order-benchmark-master/\".format(\n os.path.dirname(os.path.abspath(__file__)))\n # all_file_list = list(filter(file_filter, os.listdir(res_path)))\n # all_file_list = sorted(all_file_list, key=custom_sort)\n all_file_list = [\n '1a.sql', '1b.sql', '1c.sql', '1d.sql',\n '2a.sql', '2b.sql', '2c.sql', '2d.sql',\n '3a.sql', '3b.sql', '3c.sql',\n '4a.sql', '4b.sql', '4c.sql',\n '5a.sql', '5b.sql', '5c.sql',\n '6a.sql', '6b.sql', '6c.sql', '6d.sql', '6e.sql', '6f.sql',\n '7a.sql', '7b.sql', '7c.sql',\n '8a.sql', '8b.sql', '8c.sql', '8d.sql',\n '9a.sql', '9b.sql', '9c.sql', '9d.sql',\n '10a.sql', '10b.sql', '10c.sql',\n '11a.sql', '11b.sql', '11c.sql', '11d.sql',\n '12a.sql', '12b.sql', '12c.sql',\n '13a.sql', '13b.sql', '13c.sql', '13d.sql',\n '14a.sql', '14b.sql', '14c.sql',\n '15a.sql', '15b.sql', '15c.sql', '15d.sql',\n '16a.sql', '16b.sql', '16c.sql', '16d.sql',\n '17a.sql', '17b.sql', '17c.sql', '17d.sql', '17e.sql', '17f.sql',\n '18a.sql', '18b.sql', '18c.sql',\n '19a.sql', '19b.sql', '19c.sql', '19d.sql',\n '20a.sql', '20b.sql', '20c.sql',\n '21a.sql', '21b.sql', '21c.sql',\n '22a.sql', '22b.sql', '22c.sql', '22d.sql',\n '23a.sql', '23b.sql', '23c.sql',\n '24a.sql', '24b.sql',\n '25a.sql', '25b.sql', '25c.sql',\n '26a.sql', '26b.sql', '26c.sql',\n '27a.sql', '27b.sql', '27c.sql',\n '28a.sql', '28b.sql', '28c.sql',\n '29a.sql', '29b.sql', '29c.sql',\n '30a.sql', '30b.sql', '30c.sql',\n 
'31a.sql', '31b.sql', '31c.sql',\n '32a.sql', '32b.sql',\n '33a.sql', '33b.sql', '33c.sql']\n\n print(all_file_list)\n files_list = []\n for file in all_file_list:\n files_list.append(res_path + file)\n return files_list\n\n\ndef custom_sort(item):\n # \u63d0\u53d6\u6570\u5b57\u548c\u5b57\u6bcd\u90e8\u5206\n match = re.match(r'(\\d+)(\\D+)', item)\n # \u5c06\u6570\u5b57\u90e8\u5206\u8f6c\u6362\u4e3a\u6574\u6570\u4ee5\u8fdb\u884c\u6bd4\u8f83\n num_part = int(match.group(1))\n # \u8fd4\u56de\u5143\u7ec4\u4ee5\u6309\u6570\u5b57\u548c\u5b57\u6bcd\u6392\u5e8f\n return (num_part, match.group(2))\n\n\ndef file_filter(f):\n if f[-4:] == '.sql' and 'schema' not in f and 'fkindexes' not in f:\n return True\n else:\n return False\n\n\ndef get_sql_from_file(file_name):\n file = open(file_name)\n lines = file.readlines().copy()\n sql = ''\n for line in lines:\n sql += line\n sql = sql.replace('\n', ' ').replace(' ', ' ').replace(' ', ' ')\n file.close()\n return sql\n\n\ndef test_hint_from_file(sql_file):\n db = Database()\n sql = get_sql_from_file(sql_file)\n success, result_cont = db.execute_sql(sql)\n print(success, result_cont)\n\n\ndef test_all():\n sql_files = all_sql_files()\n\n for sql_file in list(sql_files)[:-10]:\n if sql_file:\n test_hint_from_file(sql_file)\n\n\ndef test_one():\n res_path = \"{}/join-order-benchmark-master/\".format(\n os.path.dirname(os.path.abspath(__file__)))\n test_hint_from_file(res_path + '1a.sql')\n\n\nif __name__ == '__main__':\n for i in range(0, REPEATCOUNT):\n TIMELOG.write(str(int(time.time()))+\";\")\n test_all()\n TIMELOG.write(str(int(time.time()))+\"\n\")\n TIMELOG.flush()\n\n TIMELOG.close()\n\n)\n", + "description": "In a data analysis system, multiple data tables are joined together based on a specific condition. However, the join operation is not optimized, and the system faces high CPU contention as multiple processes compete for computing resources. This leads to poor performance in executing the join operation.\n", + "desc": "In a business intelligence scenario, there is a database named 'CorporateAnalyticsDB' that stores and analyzes various business data for a large corporation. This database contains multiple complex tables, such as 'SalesData', 'CustomerProfiles', 'ProductCatalog', and 'MarketTrends', which record detailed business information. During the analysis process, analysts often need to perform complex join queries across these tables to generate comprehensive business reports. For example, they might join the 'SalesData' table with the 'CustomerProfiles' table to analyze customer purchasing behaviors or link these tables with 'ProductCatalog' and 'MarketTrends' to gain insights into the market. However, due to the large size of these tables and the lack of effective indexing or query optimization, these join queries can become very slow. In addition, when multiple complex join queries are executed simultaneously, there can be competition for CPU resources, further reducing query efficiency. 
This CPU contention may occur when there are too many compute-intensive queries running on the database server or when the server's CPU resources are insufficient to handle the queries.\n" + }, + "255": { + "start_time": "1697334012", + "end_time": "1697334151", + "start_timestamp": "2023-10-15 09:40:12", + "end_timestamp": "2023-10-15 09:42:31", + "alerts": [], + "labels": [ + "FETCH_LARGE_DATA", + "CORRELATED SUBQUERY" + ], + "command": "python anomaly_trigger/main.py --anomaly FETCH_LARGE_DATA,CORRELATED_SUBQUERY", + "script": "import os\nimport re\nimport time\n\nimport psycopg2\n\n\nREPEATCOUNT = 1\nTIMELOGPATH = str(int(time.time())) + \"_tpch_trigger_time_log.txt\"\nTIMELOG = open(TIMELOGPATH, 'w+')\n\n\nclass Database():\n\n def __init__(self):\n self.conn = None\n self.conn = psycopg2.connect(database='tpch',\n user='xxxx',\n password='xxxx',\n host='xxxx',\n port=5432)\n\n def execute_sql(self, sql):\n fail = 1\n cur = self.conn.cursor()\n i = 0\n cnt = 3\n while fail == 1 and i < cnt:\n try:\n fail = 0\n cur.execute(sql)\n except BaseException as error:\n fail = 1\n print(error)\n res = []\n if fail == 0:\n res = cur.fetchall()\n i = i + 1\n if fail == 1:\n # print(\"SQL Execution Fatal!!\", sql)\n return 0, ''\n elif fail == 0:\n return 1, res\n\n\ndef all_sql_files():\n res_path = \"{}/tpch-queries/\".format(\n os.path.dirname(os.path.abspath(__file__)))\n # all_file_list = list(filter(file_filter, os.listdir(res_path)))\n # all_file_list = sorted(all_file_list, key=custom_sort)\n all_file_list = [\n '4.explain.sql']\n\n print(all_file_list)\n files_list = []\n for file in all_file_list:\n files_list.append(res_path + file)\n return files_list\n\n\ndef custom_sort(item):\n # \u63d0\u53d6\u6570\u5b57\u548c\u5b57\u6bcd\u90e8\u5206\n match = re.match(r'(\\d+)(\\D+)', item)\n # \u5c06\u6570\u5b57\u90e8\u5206\u8f6c\u6362\u4e3a\u6574\u6570\u4ee5\u8fdb\u884c\u6bd4\u8f83\n num_part = int(match.group(1))\n # \u8fd4\u56de\u5143\u7ec4\u4ee5\u6309\u6570\u5b57\u548c\u5b57\u6bcd\u6392\u5e8f\n return (num_part, match.group(2))\n\n\ndef file_filter(f):\n if f[-4:] == '.sql' and 'schema' not in f and 'fkindexes' not in f:\n return True\n else:\n return False\n\n\ndef get_sql_from_file(file_name):\n file = open(file_name)\n lines = file.readlines().copy()\n sql = ''\n for line in lines:\n sql += line\n sql = sql.replace('\n', ' ').replace(' ', ' ').replace(' ', ' ')\n file.close()\n return sql\n\n\ndef test_hint_from_file(sql_file):\n db = Database()\n sql = get_sql_from_file(sql_file)\n success, result_cont = db.execute_sql(sql)\n print(success, result_cont)\n\n\ndef test_all():\n sql_files = all_sql_files()\n\n for sql_file in list(sql_files):\n if sql_file:\n test_hint_from_file(sql_file)\n\n\ndef test_one():\n res_path = \"{}/tpch-queries/\".format(\n os.path.dirname(os.path.abspath(__file__)))\n test_hint_from_file(res_path + '1.explain.sql')\n\n\nif __name__ == '__main__':\n for i in range(0, REPEATCOUNT):\n TIMELOG.write(str(int(time.time()))+\";\")\n test_all()\n TIMELOG.write(str(int(time.time()))+\"\n\")\n TIMELOG.flush()\n\n TIMELOG.close()\n", + "description": "In a retail platform's database, retrieving a large amount of data related to product inventory may involve executing multiple correlated subqueries. 
If these subqueries are not optimized efficiently, the performance of retrieving inventory information for each product may be negatively impacted.\n", + "desc": "In an e-commerce scenario, there is a database called 'ECommerceDB' which stores important information about the inventory of various products. One of the tables in this database, named 'ProductInventory', contains data about the inventory of tens of thousands or even hundreds of thousands of products. Each entry in this table includes details such as the product ID, current stock level, last inventory update time, supplier ID, and warehouse location. When querying the inventory level of each product, it often requires performing correlated subqueries. For example, a common query might be to determine the total current inventory of all products within a specific category. In order to obtain this information, the database needs to select all products from the 'ProductInventory' table and then perform subqueries to obtain the inventory data for these products. However, when the number of products is very large, the performance of these subqueries can become inefficient. This is because executing individual subqueries for each product to obtain inventory information can be time-consuming. Additionally, retrieving a large amount of data from the disk to fulfill these subqueries can lead to I/O bottlenecks in the database.\n" + }, + "256": { + "start_time": "1697334212", + "end_time": "1697334283", + "start_timestamp": "2023-10-15 09:43:32", + "end_timestamp": "2023-10-15 09:44:43", + "alerts": [], + "labels": [ + "highly concurrent commits or highly concurrent inserts" + ], + "command": "python anomaly_trigger/main.py --anomaly INSERT_LARGE_DATA", + "script": "import psycopg2\nimport sys\nsys.path.append('/root/DB-GPT/')\nimport time\nimport datetime\nimport random\nimport yaml\nfrom multiprocessing.pool import *\n\nclass DBArgs(object):\n\n def __init__(self, dbtype, config, dbname=None):\n self.dbtype = dbtype\n if self.dbtype == 'mysql':\n self.host = config['host']\n self.port = config['port']\n self.user = config['user']\n self.password = config['password']\n self.dbname = dbname if dbname else config['dbname']\n self.driver = 'com.mysql.jdbc.Driver'\n self.jdbc = 'jdbc:mysql://'\n else:\n self.host = config['host']\n self.port = config['port']\n self.user = config['user']\n self.password = config['password']\n self.dbname = dbname if dbname else config['dbname']\n self.driver = 'org.postgresql.Driver'\n self.jdbc = 'jdbc:postgresql://'\n\nclass Database():\n def __init__(self, args, timeout=-1):\n self.args = args\n self.conn = self.resetConn(timeout)\n\n\n # self.schema = self.compute_table_schema()\n\n def resetConn(self, timeout=-1):\n conn = psycopg2.connect(database=self.args.dbname,\n user=self.args.user,\n password=self.args.password,\n host=self.args.host,\n port=self.args.port)\n return conn\n\n def execute_sqls(self,sql):\n self.conn =self.resetConn(timeout=-1)\n cur = self.conn.cursor()\n cur.execute(sql)\n self.conn.commit()\n cur.close()\n self.conn.close()\n\n def execute_sql_duration(self, duration, sql, max_id=0, commit_interval=500):\n self.conn = self.resetConn(timeout=-1)\n cursor = self.conn.cursor()\n start = time.time()\n cnt = 0\n if duration > 0:\n while (time.time() - start) < duration:\n if max_id > 0:\n id = random.randint(1, max_id - 1)\n cursor.execute(sql + str(id) + ';')\n else:\n cursor.execute(sql)\n cnt += 1\n if cnt % commit_interval == 0:\n self.conn.commit()\n else:\n print(\"error, the 
duration should be larger than 0\")\n self.conn.commit()\n cursor.close()\n self.conn.close()\n return cnt\n\n def concurrent_execute_sql(self, threads, duration, sql, max_id=0, commit_interval=500):\n pool = ThreadPool(threads)\n results = [pool.apply_async(self.execute_sql_duration, (duration, sql, max_id, commit_interval)) for _ in range(threads)]\n pool.close()\n pool.join()\n return results\n \ndef init():\n #add the config\n config_path = \"/root/DB-GPT/config/tool_config.yaml\"\n with open(config_path, 'r') as config_file:\n config = yaml.safe_load(config_file) \n db_args =DBArgs('pgsql', config)\n return db_args\n\n#create a table\ndef create_table(table_name,colsize, ncolumns):\n db=Database(init())\n column_definitions = ', '.join(f'name{i} varchar({colsize})' for i in range(ncolumns))\n creat_sql = f'CREATE TABLE {table_name} (id int, {column_definitions}, time timestamp);'\n db.execute_sqls(creat_sql)\n\n#delete the table\ndef delete_table(table_name):\n db=Database(init())\n delete_sql=f'DROP TABLE if exists {table_name}'\n db.execute_sqls(delete_sql)\n\n#print the current time\ndef print_time():\n current_time = datetime.datetime.now()\n formatted_time = current_time.strftime(\"%Y-%m-%d %H:%M:%S\")\n print(formatted_time)\n\n\ndef insert_large_data(threads,duration,ncolumns,nrows,colsize,table_name='table1'):\n\n print_time()\n #Delete undeleted tables\n delete_table(table_name)\n #create a new table\n create_table(table_name,colsize, ncolumns)\n db=Database(init())\n #insert the data\n #insert_definitions = ', '.join(f'repeat(round(random()*999)::text,{(colsize//3)})' for i in range(ncolumns))\n insert_definitions = ', '.join(f'(SELECT substr(md5(random()::text), 1, {colsize}))' for i in range(ncolumns))\n insert_data=f'insert into {table_name} select generate_series(1,{nrows}),{insert_definitions}, now();'\n db.concurrent_execute_sql(threads,duration,insert_data,commit_interval=1)\n\n #delete the table\n delete_table(table_name)\n \n #print the end time\n print_time()\nif __name__ == \"__main__\":\n # Number of threads to use for concurrent inserts\n num_threads = 62\n \n # Duration for which to run the inserts (in seconds)\n insert_duration = None\n \n # Number of columns in the table\n num_columns = 6\n \n # Number of rows to insert\n num_rows = 71\n \n # Size of each column (in characters)\n column_size = 24\n \n # Table name\n table_name = 'table1'\n \n # Call the insert_large_data function\n insert_large_data(num_threads, insert_duration, num_columns, num_rows, column_size, table_name)\n", + "description": "In a smart home system, multiple sensors are collecting data and need to simultaneously insert a large amount of data into the database. This script simulates the database exception that can occur when 62 sensors attempt to insert data into a table containing 6 columns, 71 rows of data, with each column size being 24 characters.\n", + "desc": "In an e-commerce database called 'ECommerceDB', there is a table named 'ProductSales' that keeps track of sales information for different products. This table contains 71 rows of data, with each row representing a specific product sale. The table has 6 columns, including the sale ID, product ID, sale date, sale quantity, sale price, and customer ID, with each column having a size of 24 characters. The database administrator wants to simulate a scenario where a large amount of sales data is being inserted into the 'ProductSales' table. 
This is done by running a script 'main.py' with the command 'python anomaly_trigger/main.py --anomaly INSERT_LARGE_DATA --threads 62 --ncolumn 6 --colsize 24 --nrow 71'. The script simulates 62 concurrent threads performing the large data insertion operation, with each thread inserting data for a different product sale. This scenario helps identify any performance issues or anomalies that may arise when handling a high volume of concurrent data insertion requests in the e-commerce database.\n" + }, + "257": { + "start_time": "1697334343", + "end_time": "1697334414", + "start_timestamp": "2023-10-15 09:45:43", + "end_timestamp": "2023-10-15 09:46:54", + "alerts": [], + "labels": [ + "highly concurrent commits or highly concurrent inserts" + ], + "command": "python anomaly_trigger/main.py --anomaly INSERT_LARGE_DATA", + "script": "import psycopg2\nimport sys\nsys.path.append('/root/DB-GPT/')\nimport time\nimport datetime\nimport random\nimport yaml\nfrom multiprocessing.pool import *\n\nclass DBArgs(object):\n\n def __init__(self, dbtype, config, dbname=None):\n self.dbtype = dbtype\n if self.dbtype == 'mysql':\n self.host = config['host']\n self.port = config['port']\n self.user = config['user']\n self.password = config['password']\n self.dbname = dbname if dbname else config['dbname']\n self.driver = 'com.mysql.jdbc.Driver'\n self.jdbc = 'jdbc:mysql://'\n else:\n self.host = config['host']\n self.port = config['port']\n self.user = config['user']\n self.password = config['password']\n self.dbname = dbname if dbname else config['dbname']\n self.driver = 'org.postgresql.Driver'\n self.jdbc = 'jdbc:postgresql://'\n\nclass Database():\n def __init__(self, args, timeout=-1):\n self.args = args\n self.conn = self.resetConn(timeout)\n\n\n # self.schema = self.compute_table_schema()\n\n def resetConn(self, timeout=-1):\n conn = psycopg2.connect(database=self.args.dbname,\n user=self.args.user,\n password=self.args.password,\n host=self.args.host,\n port=self.args.port)\n return conn\n\n def execute_sqls(self,sql):\n self.conn =self.resetConn(timeout=-1)\n cur = self.conn.cursor()\n cur.execute(sql)\n self.conn.commit()\n cur.close()\n self.conn.close()\n\n def execute_sql_duration(self, duration, sql, max_id=0, commit_interval=500):\n self.conn = self.resetConn(timeout=-1)\n cursor = self.conn.cursor()\n start = time.time()\n cnt = 0\n if duration > 0:\n while (time.time() - start) < duration:\n if max_id > 0:\n id = random.randint(1, max_id - 1)\n cursor.execute(sql + str(id) + ';')\n else:\n cursor.execute(sql)\n cnt += 1\n if cnt % commit_interval == 0:\n self.conn.commit()\n else:\n print(\"error, the duration should be larger than 0\")\n self.conn.commit()\n cursor.close()\n self.conn.close()\n return cnt\n\n def concurrent_execute_sql(self, threads, duration, sql, max_id=0, commit_interval=500):\n pool = ThreadPool(threads)\n results = [pool.apply_async(self.execute_sql_duration, (duration, sql, max_id, commit_interval)) for _ in range(threads)]\n pool.close()\n pool.join()\n return results\n \ndef init():\n #add the config\n config_path = \"/root/DB-GPT/config/tool_config.yaml\"\n with open(config_path, 'r') as config_file:\n config = yaml.safe_load(config_file) \n db_args =DBArgs('pgsql', config)\n return db_args\n\n#create a table\ndef create_table(table_name,colsize, ncolumns):\n db=Database(init())\n column_definitions = ', '.join(f'name{i} varchar({colsize})' for i in range(ncolumns))\n creat_sql = f'CREATE TABLE {table_name} (id int, {column_definitions}, time timestamp);'\n 
db.execute_sqls(creat_sql)\n\n#delete the table\ndef delete_table(table_name):\n db=Database(init())\n delete_sql=f'DROP TABLE if exists {table_name}'\n db.execute_sqls(delete_sql)\n\n#print the current time\ndef print_time():\n current_time = datetime.datetime.now()\n formatted_time = current_time.strftime(\"%Y-%m-%d %H:%M:%S\")\n print(formatted_time)\n\n\ndef insert_large_data(threads,duration,ncolumns,nrows,colsize,table_name='table1'):\n\n print_time()\n #Delete undeleted tables\n delete_table(table_name)\n #create a new table\n create_table(table_name,colsize, ncolumns)\n db=Database(init())\n #insert the data\n #insert_definitions = ', '.join(f'repeat(round(random()*999)::text,{(colsize//3)})' for i in range(ncolumns))\n insert_definitions = ', '.join(f'(SELECT substr(md5(random()::text), 1, {colsize}))' for i in range(ncolumns))\n insert_data=f'insert into {table_name} select generate_series(1,{nrows}),{insert_definitions}, now();'\n db.concurrent_execute_sql(threads,duration,insert_data,commit_interval=1)\n\n #delete the table\n delete_table(table_name)\n \n #print the end time\n print_time()\nif __name__ == \"__main__\":\n # Number of threads to use for concurrent inserts\n num_threads = 62\n \n # Duration for which to run the inserts (in seconds)\n insert_duration = None\n \n # Number of columns in the table\n num_columns = 22\n \n # Number of rows to insert\n num_rows = 58\n \n # Size of each column (in characters)\n column_size = 52\n \n # Table name\n table_name = 'table1'\n \n # Call the insert_large_data function\n insert_large_data(num_threads, insert_duration, num_columns, num_rows, column_size, table_name)\n", + "description": "In an IoT application, 62 sensors generate a large amount of data to be inserted into a database simultaneously. The database table contains 22 columns and 58 rows, with each column having a size of 52 characters. This process simulates a database exception caused by the insertion of large data.\n", + "desc": "In an e-commerce platform, suppose there is a database called 'ECommerceDB' that stores information about various products. Within this database, there is a table named 'ProductDetails' which contains detailed information about products. This table consists of 58 rows of data, with each row representing a specific product. The table has 22 columns, each column having a size of 52 characters. These columns include attributes such as product ID, name, price, stock quantity, brand, category, size, color, weight, user rating, number of reviews, production date, expiration date, supplier information, country, shipping method, discount information, image link, description, and product status. In this scenario, there are 62 users simultaneously inserting large amounts of data into the 'ProductDetails' table. This could be due to various reasons such as uploading a batch of new products, updating existing product information, or importing data from external sources. However, the database might not be optimized to handle such a large influx of data simultaneously. If there are no proper indexing strategies or efficient data insertion mechanisms in place, the database's performance could suffer. This could result in slower insertion speeds, increased response times, or even possible failures in data insertion operations. 
Ultimately, these issues can lead to anomalies in the database, affecting overall system performance and the ability to effectively manage products in the e-commerce platform.\n" + }, + "258": { + "start_time": "1697334474", + "end_time": "1697334535", + "start_timestamp": "2023-10-15 09:47:54", + "end_timestamp": "2023-10-15 09:48:55", + "alerts": [], + "labels": [ + "highly concurrent updates" + ], + "command": "python anomaly_trigger/main.py --anomaly LOCK_CONTENTION", + "script": "import psycopg2\nimport sys\nsys.path.append('/root/DB-GPT/')\nimport time\nimport datetime\nimport random\nimport yaml\nfrom multiprocessing.pool import *\n\n\nclass DBArgs(object):\n\n def __init__(self, dbtype, config, dbname=None):\n self.dbtype = dbtype\n if self.dbtype == 'mysql':\n self.host = config['host']\n self.port = config['port']\n self.user = config['user']\n self.password = config['password']\n self.dbname = dbname if dbname else config['dbname']\n self.driver = 'com.mysql.jdbc.Driver'\n self.jdbc = 'jdbc:mysql://'\n else:\n self.host = config['host']\n self.port = config['port']\n self.user = config['user']\n self.password = config['password']\n self.dbname = dbname if dbname else config['dbname']\n self.driver = 'org.postgresql.Driver'\n self.jdbc = 'jdbc:postgresql://'\n\nclass Database():\n def __init__(self, args, timeout=-1):\n self.args = args\n self.conn = self.resetConn(timeout)\n\n\n # self.schema = self.compute_table_schema()\n\n def resetConn(self, timeout=-1):\n conn = psycopg2.connect(database=self.args.dbname,\n user=self.args.user,\n password=self.args.password,\n host=self.args.host,\n port=self.args.port)\n return conn\n \n def execute_sqls(self,sql):\n self.conn =self.resetConn(timeout=-1)\n cur = self.conn.cursor()\n cur.execute(sql)\n self.conn.commit()\n cur.close()\n self.conn.close()\n\n def execute_sql_duration(self, duration, sql, max_id=0, commit_interval=500):\n self.conn = self.resetConn(timeout=-1)\n cursor = self.conn.cursor()\n start = time.time()\n cnt = 0\n if duration > 0:\n while (time.time() - start) < duration:\n if max_id > 0:\n id = random.randint(1, max_id - 1)\n cursor.execute(sql + str(id) + ';')\n else:\n cursor.execute(sql)\n cnt += 1\n if cnt % commit_interval == 0:\n self.conn.commit()\n else:\n print(\"error, the duration should be larger than 0\")\n self.conn.commit()\n cursor.close()\n self.conn.close()\n return cnt\n\n def concurrent_execute_sql(self, threads, duration, sql, max_id=0, commit_interval=500):\n pool = ThreadPool(threads)\n results = [pool.apply_async(self.execute_sql_duration, (duration, sql, max_id, commit_interval)) for _ in range(threads)]\n pool.close()\n pool.join()\n return results\n\ndef init():\n #add the config\n config_path = \"/root/DB-GPT/config/tool_config.yaml\"\n with open(config_path, 'r') as config_file:\n config = yaml.safe_load(config_file) \n db_args =DBArgs('pgsql', config)\n return db_args\n\n\n#create a table\ndef create_table(table_name,colsize, ncolumns):\n db=Database(init())\n column_definitions = ', '.join(f'name{i} varchar({colsize})' for i in range(ncolumns))\n creat_sql = f'CREATE TABLE {table_name} (id int, {column_definitions}, time timestamp);'\n db.execute_sqls(creat_sql)\n\n#delete the table\ndef delete_table(table_name):\n db=Database(init())\n delete_sql=f'DROP TABLE if exists {table_name}'\n db.execute_sqls(delete_sql)\n\n#print the current time\ndef print_time():\n current_time = datetime.datetime.now()\n formatted_time = current_time.strftime(\"%Y-%m-%d %H:%M:%S\")\n 
print(formatted_time)\n\n\ndef lock_contention(threads,duration,ncolumns,nrows,colsize,table_name='table1'):\n #create a new table\n print_time()\n delete_table(table_name)\n create_table(table_name,colsize, ncolumns)\n db=Database(init())\n # insert some data to be updated \n insert_definitions = ', '.join(f'(SELECT substr(md5(random()::text), 1, {colsize}))' for i in range(ncolumns))\n insert_data=f'insert into {table_name} select generate_series(1,{nrows}),{insert_definitions}, now();' \n db.execute_sqls(insert_data) \n pool = Pool(threads)\n for _ in range(threads):\n pool.apply_async(\n lock, (table_name, ncolumns, colsize, duration, nrows))\n pool.close()\n pool.join()\n #delete the table\n delete_table(table_name)\n print_time()\n\ndef lock(table_name, ncolumns, colsize, duration, nrows):\n args=init()\n start = time.time()\n #lock_contention\n while time.time()-start < duration:\n conn = psycopg2.connect(database=args.dbname, user=args.user, password=args.password,\n host=args.host, port=args.port)\n cur = conn.cursor()\n while time.time()-start < duration:\n col_name = random.randint(0, ncolumns-1)\n row_name = random.randint(1, nrows-1)\n lock_contention = f'update {table_name} set name{col_name}=(SELECT substr(md5(random()::text), 1, {colsize})) where id ={row_name}'\n #db.concurrent_execute_sql(threads,duration,lock_contention,nrows)\n cur.execute(lock_contention)\n conn.commit()\n conn.commit()\n conn.close()\n\nif __name__ == \"__main__\":\n # Number of threads to use for concurrent inserts\n num_threads = 108\n \n # Duration for which to run the inserts (in seconds)\n insert_duration = None\n \n # Number of columns in the table\n num_columns = 55\n \n # Number of rows to insert\n num_rows = 400\n \n # Size of each column (in characters)\n column_size = 93\n \n # Table name\n table_name = 'table1'\n \n # Call the insert_large_data function\n lock_contention(num_threads, insert_duration, num_columns, num_rows, column_size, table_name)\n", + "description": "In the database of an online store, 108 users simultaneously attempt to perform frequent update operations in a database table containing 55 columns and 400 rows of product records, each with a column size of 93 characters. Multiple users compete with each other to lock the database table to perform the update operation.\n", + "desc": "In a banking scenario, there is a database named 'BankingDB' that stores customer and transaction data for a bank. This database contains a table called 'AccountTransactions' that records detailed information about various banking transactions. The table consists of 400 rows of data, each representing a transaction record for an account, with a total of 55 columns, each containing information up to 93 characters. These columns may include transaction ID, account number, transaction type (such as deposit, withdrawal, or transfer), transaction amount, transaction date and time, counterparty account information, transaction status, employee ID (of the bank staff who processed the transaction), transaction location, currency type, and more.In a typical banking scenario, there are often multiple users, such as bank staff, automated systems, or customers through an online banking platform, who simultaneously perform frequent update operations on the 'AccountTransactions' table. 
These operations could involve updating transaction statuses, modifying transaction amounts, or adding transaction notes.Suppose at a specific moment, 108 users almost simultaneously try to update the same or adjacent rows in this table. Due to the table's design and the database's locking mechanism, these concurrent update operations could lead to contention for locking the database table. If such locking lasts for a significant duration, it could trigger performance issues in the database.During peak banking hours, such prolonged locking could result in delayed processing or failure of other users' transaction requests, thereby affecting the daily operations of the bank. Moreover, if such incidents occur frequently, they could also lead to rapid growth in the database transaction log, consuming excessive storage space, and might even cause temporary interruptions in database services.\n" + }, + "259": { + "start_time": "1697334595", + "end_time": "1697334640", + "start_timestamp": "2023-10-15 09:49:55", + "end_timestamp": "2023-10-15 09:50:40", + "alerts": [], + "labels": [ + "highly deletes" + ], + "command": "python anomaly_trigger/main.py --anomaly VACUUM", + "script": "import psycopg2\nimport sys\nsys.path.append('/root/DB-GPT/')\nimport time\nimport datetime\nimport random\nimport yaml\nfrom multiprocessing.pool import *\n\n\nclass DBArgs(object):\n\n def __init__(self, dbtype, config, dbname=None):\n self.dbtype = dbtype\n if self.dbtype == 'mysql':\n self.host = config['host']\n self.port = config['port']\n self.user = config['user']\n self.password = config['password']\n self.dbname = dbname if dbname else config['dbname']\n self.driver = 'com.mysql.jdbc.Driver'\n self.jdbc = 'jdbc:mysql://'\n else:\n self.host = config['host']\n self.port = config['port']\n self.user = config['user']\n self.password = config['password']\n self.dbname = dbname if dbname else config['dbname']\n self.driver = 'org.postgresql.Driver'\n self.jdbc = 'jdbc:postgresql://'\n\nclass Database():\n def __init__(self, args, timeout=-1):\n self.args = args\n self.conn = self.resetConn(timeout)\n\n\n # self.schema = self.compute_table_schema()\n\n def resetConn(self, timeout=-1):\n conn = psycopg2.connect(database=self.args.dbname,\n user=self.args.user,\n password=self.args.password,\n host=self.args.host,\n port=self.args.port)\n return conn\n \n def execute_sqls(self,sql):\n self.conn =self.resetConn(timeout=-1)\n cur = self.conn.cursor()\n cur.execute(sql)\n self.conn.commit()\n cur.close()\n self.conn.close()\n\n def execute_sql_duration(self, duration, sql, max_id=0, commit_interval=500):\n self.conn = self.resetConn(timeout=-1)\n cursor = self.conn.cursor()\n start = time.time()\n cnt = 0\n if duration > 0:\n while (time.time() - start) < duration:\n if max_id > 0:\n id = random.randint(1, max_id - 1)\n cursor.execute(sql + str(id) + ';')\n else:\n cursor.execute(sql)\n cnt += 1\n if cnt % commit_interval == 0:\n self.conn.commit()\n else:\n print(\"error, the duration should be larger than 0\")\n self.conn.commit()\n cursor.close()\n self.conn.close()\n return cnt\n\n def concurrent_execute_sql(self, threads, duration, sql, max_id=0, commit_interval=500):\n pool = ThreadPool(threads)\n results = [pool.apply_async(self.execute_sql_duration, (duration, sql, max_id, commit_interval)) for _ in range(threads)]\n pool.close()\n pool.join()\n return results\n \ndef init():\n #add the config\n config_path = \"/root/DB-GPT/config/tool_config.yaml\"\n with open(config_path, 'r') as config_file:\n config = 
yaml.safe_load(config_file) \n db_args =DBArgs('pgsql', config)\n return db_args\n\n\n#create a table\ndef create_table(table_name,colsize, ncolumns):\n db=Database(init())\n column_definitions = ', '.join(f'name{i} varchar({colsize})' for i in range(ncolumns))\n creat_sql = f'CREATE TABLE {table_name} (id int, {column_definitions}, time timestamp);'\n db.execute_sqls(creat_sql)\n\n#delete the table\ndef delete_table(table_name):\n db=Database(init())\n delete_sql=f'DROP TABLE if exists {table_name}'\n db.execute_sqls(delete_sql)\n\n#print the current time\ndef print_time():\n current_time = datetime.datetime.now()\n formatted_time = current_time.strftime(\"%Y-%m-%d %H:%M:%S\")\n print(formatted_time)\n\n\ndef vacuum(threads,duration,ncolumns,nrows,colsize,table_name='table1'):\n db=Database(init())\n #create a new table\n print_time()\n delete_table(table_name)\n create_table(table_name,colsize, ncolumns)\n\n # insert some data to be deleted\n insert_definitions = ', '.join(f'(SELECT substr(md5(random()::text), 1, {colsize}))' for i in range(ncolumns))\n insert_data=f'insert into {table_name} select generate_series(1,{nrows}),{insert_definitions}, now();' \n db.execute_sqls(insert_data) \n\n # delete 80% of the rows\n delete_nrows=int(nrows*0.8)\n vacuum=f'delete from {table_name} where id < {delete_nrows};'\n db.execute_sqls(vacuum)\n\n # do the select , then the vacuum occurs\n select='select * from '+table_name+' where id='\n db.concurrent_execute_sql(threads,duration,select,nrows)\n\n #delete the table\n delete_table(table_name)\n print_time()\n\n\nif __name__ == \"__main__\":\n # Number of threads to use for concurrent inserts\n num_threads = 181\n \n # Duration for which to run the inserts (in seconds)\n insert_duration = None\n \n # Number of columns in the table\n num_columns = 6\n \n # Number of rows to insert\n num_rows = 2583250\n \n # Size of each column (in characters)\n column_size = 52\n \n # Table name\n table_name = 'table1'\n \n # Call the insert_large_data function\n vacuum(num_threads, insert_duration, num_columns, num_rows, column_size, table_name)\n", + "description": "In a database used by an online marketplace, there is a need to optimize the performance of search queries. In this case, there are 181 users simultaneously searching for products in a table with 6 columns and 2,583,250 rows of records, with each column containing 52 characters. By running the script provided, a simulation is performed to trigger an exception in the database due to the lack of proper indexing during the search process.\n", + "desc": "In an e-commerce scenario, there is a database named 'ProductDB' used for an online store. This database contains a main table named 'ProductDetails' which stores information about various products. The 'ProductDetails' table consists of 2,583,250 rows of data, with each row representing a product entry. This table has 6 columns, each column being a maximum of 52 characters in size. The columns in this table include product ID, name, price, stock quantity, description, and category.In order to optimize the database and improve its performance, the database administrator needs to perform a VACUUM operation. VACUUM is a database maintenance operation that reclaims storage space and improves query performance by rearranging and optimizing the physical storage of data.However, performing a VACUUM operation on such a large table with a high number of threads (181) can result in performance issues. 
This is because the VACUUM operation involves scanning and updating a large amount of data, which can be resource-intensive. In addition, the high number of threads can lead to contention for system resources, such as CPU and disk I/O, further impacting the performance of the database.\n" + }, + "260": { + "start_time": "1697334700", + "end_time": "1697334814", + "start_timestamp": "2023-10-15 09:51:40", + "end_timestamp": "2023-10-15 09:53:34", + "alerts": [], + "labels": [ + "too many indexes" + ], + "command": "python anomaly_trigger/main.py --anomaly REDUNDANT_INDEX", + "script": "import psycopg2\nimport sys\nsys.path.append('/root/DB-GPT/')\nimport time\nimport datetime\nimport random\nimport yaml\nfrom multiprocessing.pool import *\n\n\nclass DBArgs(object):\n\n def __init__(self, dbtype, config, dbname=None):\n self.dbtype = dbtype\n if self.dbtype == 'mysql':\n self.host = config['host']\n self.port = config['port']\n self.user = config['user']\n self.password = config['password']\n self.dbname = dbname if dbname else config['dbname']\n self.driver = 'com.mysql.jdbc.Driver'\n self.jdbc = 'jdbc:mysql://'\n else:\n self.host = config['host']\n self.port = config['port']\n self.user = config['user']\n self.password = config['password']\n self.dbname = dbname if dbname else config['dbname']\n self.driver = 'org.postgresql.Driver'\n self.jdbc = 'jdbc:postgresql://'\n\nclass Database():\n def __init__(self, args, timeout=-1):\n self.args = args\n self.conn = self.resetConn(timeout)\n\n\n # self.schema = self.compute_table_schema()\n\n def resetConn(self, timeout=-1):\n conn = psycopg2.connect(database=self.args.dbname,\n user=self.args.user,\n password=self.args.password,\n host=self.args.host,\n port=self.args.port)\n return conn\n \n def execute_sqls(self,sql):\n self.conn =self.resetConn(timeout=-1)\n cur = self.conn.cursor()\n cur.execute(sql)\n self.conn.commit()\n cur.close()\n self.conn.close()\n\n def execute_sql_duration(self, duration, sql, max_id=0, commit_interval=500):\n self.conn = self.resetConn(timeout=-1)\n cursor = self.conn.cursor()\n start = time.time()\n cnt = 0\n if duration > 0:\n while (time.time() - start) < duration:\n if max_id > 0:\n id = random.randint(1, max_id - 1)\n cursor.execute(sql + str(id) + ';')\n else:\n cursor.execute(sql)\n cnt += 1\n if cnt % commit_interval == 0:\n self.conn.commit()\n else:\n print(\"error, the duration should be larger than 0\")\n self.conn.commit()\n cursor.close()\n self.conn.close()\n return cnt\n\n def concurrent_execute_sql(self, threads, duration, sql, max_id=0, commit_interval=500):\n pool = ThreadPool(threads)\n results = [pool.apply_async(self.execute_sql_duration, (duration, sql, max_id, commit_interval)) for _ in range(threads)]\n pool.close()\n pool.join()\n return results\n \n def build_index(self, table_name, idx_num):\n self.conn = self.resetConn(timeout=-1)\n cursor = self.conn.cursor()\n \n for i in range(0, idx_num):\n the_sql = 'CREATE INDEX index_' + table_name + '_' + str(i) + ' ON ' + table_name + '(name' + str(i) + ');'\n print(the_sql)\n cursor.execute(the_sql)\n\n \n self.conn.commit()\n self.conn.close()\n return\n\n\n \n def drop_index(self,table_name):\n self.conn = self.resetConn(timeout=-1)\n cursor = self.conn.cursor()\n cursor.execute(\"select indexname from pg_indexes where tablename='\"+table_name+\"';\")\n idxs = cursor.fetchall()\n for idx in idxs:\n the_sql = 'DROP INDEX ' + idx[0] + ';'\n cursor.execute(the_sql)\n print(the_sql)\n self.conn.commit()\n self.conn.close()\n return\n\n\ndef 
init():\n #add the config\n config_path = \"/root/DB-GPT/config/tool_config.yaml\"\n with open(config_path, 'r') as config_file:\n config = yaml.safe_load(config_file) \n db_args =DBArgs('pgsql', config)\n return db_args\n\n\n#create a table\ndef create_table(table_name,colsize, ncolumns):\n db=Database(init())\n column_definitions = ', '.join(f'name{i} varchar({colsize})' for i in range(ncolumns))\n creat_sql = f'CREATE TABLE {table_name} (id int, {column_definitions}, time timestamp);'\n db.execute_sqls(creat_sql)\n\n#delete the table\ndef delete_table(table_name):\n db=Database(init())\n delete_sql=f'DROP TABLE if exists {table_name}'\n db.execute_sqls(delete_sql)\n\n#print the current time\ndef print_time():\n current_time = datetime.datetime.now()\n formatted_time = current_time.strftime(\"%Y-%m-%d %H:%M:%S\")\n print(formatted_time)\n\ndef redundent_index(threads,duration,ncolumns,nrows,colsize,nindex,table_name='table1'):\n #create a new table\n print_time()\n delete_table(table_name)\n create_table(table_name,colsize, ncolumns)\n db=Database(init())\n # insert some data to be updated \n insert_definitions = ', '.join(f'(SELECT substr(md5(random()::text), 1, {colsize}))' for i in range(ncolumns))\n insert_data=f'insert into {table_name} select generate_series(1,{nrows}),{insert_definitions}, now();' \n db.execute_sqls(insert_data) \n\n #initialization of the indexes\n nindex=int((nindex*ncolumns)/10)\n db.build_index(table_name,nindex)\n id_index='CREATE INDEX index_'+table_name+'_id ON '+table_name+'(id);'\n db.execute_sqls(id_index)\n\n #lock_contention\n pool = Pool(threads)\n for _ in range(threads):\n pool.apply_async(\n lock, (table_name, ncolumns, colsize, duration, nrows))\n pool.close()\n pool.join()\n\n #drop the index\n db.drop_index(table_name)\n\n #delete the table\n delete_table(table_name)\n print_time()\n\n\ndef lock(table_name, ncolumns, colsize, duration, nrows):\n args=init()\n start = time.time()\n #lock_contention\n while time.time()-start < duration:\n conn = psycopg2.connect(database=args.dbname, user=args.user, password=args.password,\n host=args.host, port=args.port)\n cur = conn.cursor()\n while time.time()-start < duration:\n col_name = random.randint(0, ncolumns-1)\n row_name = random.randint(1, nrows-1)\n lock_contention = f'update {table_name} set name{col_name}=(SELECT substr(md5(random()::text), 1, {colsize})) where id ={row_name}'\n #db.concurrent_execute_sql(threads,duration,lock_contention,nrows)\n cur.execute(lock_contention)\n conn.commit()\n conn.commit()\n conn.close()\n\nif __name__ == \"__main__\":\n # Number of threads to use for concurrent inserts\n num_threads = 8\n \n # Duration for which to run the inserts (in seconds)\n insert_duration = None\n \n # Number of columns in the table\n num_columns = 79\n \n # Number of rows to insert\n num_rows = 811500\n \n # Size of each column (in characters)\n column_size = 78\n \n # Table name\n table_name = 'table1'\n \n nindex=6\n \n # Call the insert_large_data function\n redundent_index(num_threads, insert_duration, num_columns, num_rows, column_size, nindex,table_name)\n", + "description": "In a large-scale database of an e-commerce platform, 8 users query a table containing 79 columns and 811,500 rows of product records. Each column has a size of 78 characters. 
The script simulates the creation and deletion of redundant indexes for attributes such as product name, category, and price range, resulting in additional storage overhead and performance degradation.\n", + "desc": "In an e-commerce database named 'OnlineStoreDB', there is a key table called 'ProductRecords' which stores detailed information about products. This table has a total of 811,500 rows, with each row representing a separate product entry. It consists of 79 columns, each containing information of up to 78 characters. These columns include product ID, name, price, stock quantity, description, brand, category, size, color, weight, production date, expiration date, supplier information, country, rating, number of reviews, sales status, promotional information, image link, and other related attributes. The 'REDUNDANT_INDEX' anomaly occurs when the administrator frequently creates and deletes multiple indexes in order to optimize complex queries. This can lead to additional storage usage and performance overhead in the database. Additionally, frequent index operations can cause database fragmentation, resulting in delayed report generation and impacting the decision-making process in a business intelligence environment. In this particular scenario, the anomaly is triggered by executing the 'REDUNDANT_INDEX' anomaly with 8 threads, 79 columns, 78 characters column size, and 811,500 rows of data.\n" + }, + "261": { + "start_time": "1697334875", + "end_time": "1697334965", + "start_timestamp": "2023-10-15 09:54:35", + "end_timestamp": "2023-10-15 09:56:05", + "alerts": [], + "labels": [ + "INSERT_LARGE_DATA", + "IO_CONTENTION" + ], + "command": "python anomaly_trigger/main.py --anomaly INSERT_LARGE_DATA,IO_CONTENTION", + "script": "import os\nimport datetime\n\n#print the current time\ndef print_time():\n current_time = datetime.datetime.now()\n formatted_time = current_time.strftime(\"%Y-%m-%d %H:%M:%S\")\n print(formatted_time)\n\nif __name__ == \"__main__\":\n print_time()\n command = (\n \"su - root -c 'cd /sysbench-tpcc-master; \"\n \"./tpcc.lua --db-driver=pgsql --tables=2 --scale=3 --threads=50 --events=0 \"\n \"--pgsql-host=xxxx --pgsql-user=xxxx --pgsql-password=xxxx \"\n \"--pgsql-port=5432 --pgsql-db=tpcc --time=90 --rand-type=uniform --report-interval=10 run'\"\n )\n\n os.system(command)\n print_time()\n", + "description": "In a file sharing system where multiple users share files, there is a concurrent uploading, downloading, or editing of files. This leads to I/O contention, which causes a slowdown in the file transfer process.\n", + "desc": "In the file sharing system scenario, we can envision a database named 'TeamFileShareDB', which is a system used by teams or organizations for sharing files. This database not only stores the files themselves but also records the metadata of the files, such as uploader information, file size, creation and modification dates, version history, access permissions, and download counts. On a typical workday, multiple users might be simultaneously uploading, downloading, or editing files. For example, a project team is collaborating to complete an important report, with team members frequently uploading the latest versions of files, while others download them for viewing or editing. Additionally, the system might be used for storing and sharing large files, such as presentations, video conference recordings, or design drawings. 
Due to such high concurrency in file operations, the database 'TeamFileShareDB' faces challenges of I/O (input/output) contention. When multiple users simultaneously upload or download large files, the system's storage and network bandwidth might be significantly strained. This I/O contention can lead to slower file transfer speeds, especially in situations of limited bandwidth or insufficient server processing capabilities. Furthermore, frequent write operations in the database (such as file uploads and metadata updates) can impact database performance. During peak periods, the database might encounter locking and transaction management issues, further slowing down file processing and metadata recording.\n" + }, + "262": { + "start_time": "1697335025", + "end_time": "1697335086", + "start_timestamp": "2023-10-15 09:57:05", + "end_timestamp": "2023-10-15 09:58:06", + "alerts": [], + "labels": [ + "POOR JOIN PERFORMANCE", + "CPU CONTENTION" + ], + "command": "python anomaly_trigger/main.py --anomaly POOR_JOIN_PERFORMANCE,CPU_CONTENTION", + "script": "import os\nimport re\nimport time\n\nimport psycopg2\n\n\nREPEATCOUNT = 1\nTIMELOGPATH = str(int(time.time())) + \"_job_trigger_time_log.txt\"\nTIMELOG = open(TIMELOGPATH, 'w+')\n\n\nclass Database():\n\n def __init__(self):\n self.conn = None\n self.conn = psycopg2.connect(database='imdbload',\n user='xxxx',\n password='xxxx',\n host='xxxx',\n port=5432)\n\n def execute_sql(self, sql):\n fail = 1\n cur = self.conn.cursor()\n i = 0\n cnt = 3\n while fail == 1 and i < cnt:\n try:\n fail = 0\n cur.execute(sql)\n except BaseException as error:\n fail = 1\n print(error)\n res = []\n if fail == 0:\n res = cur.fetchall()\n i = i + 1\n if fail == 1:\n # print(\"SQL Execution Fatal!!\", sql)\n return 0, ''\n elif fail == 0:\n return 1, res\n\n\ndef all_sql_files():\n res_path = \"{}/join-order-benchmark-master/\".format(\n os.path.dirname(os.path.abspath(__file__)))\n # all_file_list = list(filter(file_filter, os.listdir(res_path)))\n # all_file_list = sorted(all_file_list, key=custom_sort)\n all_file_list = [\n '1a.sql', '1b.sql', '1c.sql', '1d.sql',\n '2a.sql', '2b.sql', '2c.sql', '2d.sql',\n '3a.sql', '3b.sql', '3c.sql',\n '4a.sql', '4b.sql', '4c.sql',\n '5a.sql', '5b.sql', '5c.sql',\n '6a.sql', '6b.sql', '6c.sql', '6d.sql', '6e.sql', '6f.sql',\n '7a.sql', '7b.sql', '7c.sql',\n '8a.sql', '8b.sql', '8c.sql', '8d.sql',\n '9a.sql', '9b.sql', '9c.sql', '9d.sql',\n '10a.sql', '10b.sql', '10c.sql',\n '11a.sql', '11b.sql', '11c.sql', '11d.sql',\n '12a.sql', '12b.sql', '12c.sql',\n '13a.sql', '13b.sql', '13c.sql', '13d.sql',\n '14a.sql', '14b.sql', '14c.sql',\n '15a.sql', '15b.sql', '15c.sql', '15d.sql',\n '16a.sql', '16b.sql', '16c.sql', '16d.sql',\n '17a.sql', '17b.sql', '17c.sql', '17d.sql', '17e.sql', '17f.sql',\n '18a.sql', '18b.sql', '18c.sql',\n '19a.sql', '19b.sql', '19c.sql', '19d.sql',\n '20a.sql', '20b.sql', '20c.sql',\n '21a.sql', '21b.sql', '21c.sql',\n '22a.sql', '22b.sql', '22c.sql', '22d.sql',\n '23a.sql', '23b.sql', '23c.sql',\n '24a.sql', '24b.sql',\n '25a.sql', '25b.sql', '25c.sql',\n '26a.sql', '26b.sql', '26c.sql',\n '27a.sql', '27b.sql', '27c.sql',\n '28a.sql', '28b.sql', '28c.sql',\n '29a.sql', '29b.sql', '29c.sql',\n '30a.sql', '30b.sql', '30c.sql',\n '31a.sql', '31b.sql', '31c.sql',\n '32a.sql', '32b.sql',\n '33a.sql', '33b.sql', '33c.sql']\n\n print(all_file_list)\n files_list = []\n for file in all_file_list:\n files_list.append(res_path + file)\n return files_list\n\n\ndef custom_sort(item):\n # 
Extract the numeric and alphabetic parts\n match = re.match(r'(\\d+)(\\D+)', item)\n # Convert the numeric part to an integer for comparison\n num_part = int(match.group(1))\n # Return a tuple to sort by the numeric and alphabetic parts\n return (num_part, match.group(2))\n\n\ndef file_filter(f):\n if f[-4:] == '.sql' and 'schema' not in f and 'fkindexes' not in f:\n return True\n else:\n return False\n\n\ndef get_sql_from_file(file_name):\n file = open(file_name)\n lines = file.readlines().copy()\n sql = ''\n for line in lines:\n sql += line\n sql = sql.replace('\n', ' ').replace(' ', ' ').replace(' ', ' ')\n file.close()\n return sql\n\n\ndef test_hint_from_file(sql_file):\n db = Database()\n sql = get_sql_from_file(sql_file)\n success, result_cont = db.execute_sql(sql)\n print(success, result_cont)\n\n\ndef test_all():\n sql_files = all_sql_files()\n\n for sql_file in list(sql_files)[:-10]:\n if sql_file:\n test_hint_from_file(sql_file)\n\n\ndef test_one():\n res_path = \"{}/join-order-benchmark-master/\".format(\n os.path.dirname(os.path.abspath(__file__)))\n test_hint_from_file(res_path + '1a.sql')\n\n\nif __name__ == '__main__':\n for i in range(0, REPEATCOUNT):\n TIMELOG.write(str(int(time.time()))+\";\")\n test_all()\n TIMELOG.write(str(int(time.time()))+\"\n\")\n TIMELOG.flush()\n\n TIMELOG.close()\n", + "description": "In a data analysis system, multiple users are executing join queries simultaneously on a database table with poor join performance. The system experiences contention for CPU resources, which slows down the execution of these queries.\n", + "desc": "In a business intelligence scenario, suppose there is a database named 'CorporateAnalyticsDB' used for storing and analyzing various business data of a large corporation. This database contains multiple complex tables such as 'SalesData', 'CustomerProfiles', 'ProductCatalog', and 'MarketTrends', each filled with a large number of rows and columns, recording detailed business information. In this scenario, the company's analysts frequently need to perform complex join queries across multiple tables to generate comprehensive business reports. For example, they might need to join the 'SalesData' table (containing sales records) with the 'CustomerProfiles' table (containing customer information) to analyze the purchasing behaviors of different customer groups. At the same time, they might also need to link these data with 'ProductCatalog' (the product catalog) and 'MarketTrends' (market trends) to gain deeper market insights. However, due to factors such as ineffective indexing of join keys or suboptimal query optimization, executing these join operations can be very slow. This can significantly impact the performance of these complex queries, making them time-consuming and resource-intensive. Additionally, when multiple complex join queries are executed simultaneously, it can result in competition for CPU resources, further reducing the overall query efficiency. 
This CPU contention might occur due to either too many compute-intensive queries running on the database server or insufficient CPU resources to handle the workload effectively.\n" + }, + "263": { + "start_time": "1697335146", + "end_time": "1697335295", + "start_timestamp": "2023-10-15 09:59:06", + "end_timestamp": "2023-10-15 10:01:35", + "alerts": [], + "labels": [ + "FETCH_LARGE_DATA", + "CORRELATED SUBQUERY" + ], + "command": "python anomaly_trigger/main.py --anomaly FETCH_LARGE_DATA,CORRELATED_SUBQUERY", + "script": "import os\nimport re\nimport time\n\nimport psycopg2\n\n\nREPEATCOUNT = 1\nTIMELOGPATH = str(int(time.time())) + \"_tpch_trigger_time_log.txt\"\nTIMELOG = open(TIMELOGPATH, 'w+')\n\n\nclass Database():\n\n def __init__(self):\n self.conn = None\n self.conn = psycopg2.connect(database='tpch',\n user='xxxx',\n password='xxxx',\n host='xxxx',\n port=5432)\n\n def execute_sql(self, sql):\n fail = 1\n cur = self.conn.cursor()\n i = 0\n cnt = 3\n while fail == 1 and i < cnt:\n try:\n fail = 0\n cur.execute(sql)\n except BaseException as error:\n fail = 1\n print(error)\n res = []\n if fail == 0:\n res = cur.fetchall()\n i = i + 1\n if fail == 1:\n # print(\"SQL Execution Fatal!!\", sql)\n return 0, ''\n elif fail == 0:\n return 1, res\n\n\ndef all_sql_files():\n res_path = \"{}/tpch-queries/\".format(\n os.path.dirname(os.path.abspath(__file__)))\n # all_file_list = list(filter(file_filter, os.listdir(res_path)))\n # all_file_list = sorted(all_file_list, key=custom_sort)\n all_file_list = [\n '4.explain.sql']\n\n print(all_file_list)\n files_list = []\n for file in all_file_list:\n files_list.append(res_path + file)\n return files_list\n\n\ndef custom_sort(item):\n # Extract the numeric and alphabetic parts\n match = re.match(r'(\\d+)(\\D+)', item)\n # Convert the numeric part to an integer for comparison\n num_part = int(match.group(1))\n # Return a tuple to sort by the numeric and alphabetic parts\n return (num_part, match.group(2))\n\n\ndef file_filter(f):\n if f[-4:] == '.sql' and 'schema' not in f and 'fkindexes' not in f:\n return True\n else:\n return False\n\n\ndef get_sql_from_file(file_name):\n file = open(file_name)\n lines = file.readlines().copy()\n sql = ''\n for line in lines:\n sql += line\n sql = sql.replace('\n', ' ').replace(' ', ' ').replace(' ', ' ')\n file.close()\n return sql\n\n\ndef test_hint_from_file(sql_file):\n db = Database()\n sql = get_sql_from_file(sql_file)\n success, result_cont = db.execute_sql(sql)\n print(success, result_cont)\n\n\ndef test_all():\n sql_files = all_sql_files()\n\n for sql_file in list(sql_files):\n if sql_file:\n test_hint_from_file(sql_file)\n\n\ndef test_one():\n res_path = \"{}/tpch-queries/\".format(\n os.path.dirname(os.path.abspath(__file__)))\n test_hint_from_file(res_path + '1.explain.sql')\n\n\nif __name__ == '__main__':\n for i in range(0, REPEATCOUNT):\n TIMELOG.write(str(int(time.time()))+\";\")\n test_all()\n TIMELOG.write(str(int(time.time()))+\"\n\")\n TIMELOG.flush()\n\n TIMELOG.close()\n", + "description": "In an e-commerce platform's database, when trying to fetch a large amount of data, specifically the inventory for each product, the process may involve executing related subqueries. 
If these subqueries are not optimized, the performance of querying inventory may deteriorate.\n", + "desc": "In an Internet of Things (IoT) scenario, imagine a database named 'SensorDataDB' that is used to store and analyze data collected from various sensors. This database contains a key table called 'SensorReadings' which stores detailed information about sensor readings. Each row in this table represents a reading from a sensor and includes information such as sensor ID, reading type (e.g., temperature, humidity, pressure), reading value, timestamp, and sensor location.In this scenario, there might be a need to fetch large amounts of data from the 'SensorReadings' table, especially when performing complex analysis or data processing tasks. For example, one might want to retrieve all sensor readings from a specific location or within a certain time range. When the database is queried to fetch large amounts of data or to perform correlated subqueries, the performance of these queries can be negatively impacted. This is particularly true when there is a lack of appropriate indexing or optimization techniques, as it can result in slower query execution times and increased resource usage. Additionally, when multiple complex queries are executed simultaneously, it can lead to competition for database resources and further degrade the performance of these fetch operations.\n" + }, + "264": { + "start_time": "1697335355", + "end_time": "1697335426", + "start_timestamp": "2023-10-15 10:02:35", + "end_timestamp": "2023-10-15 10:03:46", + "alerts": [], + "labels": [ + "highly concurrent commits or highly concurrent inserts" + ], + "command": "python anomaly_trigger/main.py --anomaly INSERT_LARGE_DATA", + "script": "import psycopg2\nimport sys\nsys.path.append('/root/DB-GPT/')\nimport time\nimport datetime\nimport random\nimport yaml\nfrom multiprocessing.pool import *\n\nclass DBArgs(object):\n\n def __init__(self, dbtype, config, dbname=None):\n self.dbtype = dbtype\n if self.dbtype == 'mysql':\n self.host = config['host']\n self.port = config['port']\n self.user = config['user']\n self.password = config['password']\n self.dbname = dbname if dbname else config['dbname']\n self.driver = 'com.mysql.jdbc.Driver'\n self.jdbc = 'jdbc:mysql://'\n else:\n self.host = config['host']\n self.port = config['port']\n self.user = config['user']\n self.password = config['password']\n self.dbname = dbname if dbname else config['dbname']\n self.driver = 'org.postgresql.Driver'\n self.jdbc = 'jdbc:postgresql://'\n\nclass Database():\n def __init__(self, args, timeout=-1):\n self.args = args\n self.conn = self.resetConn(timeout)\n\n\n # self.schema = self.compute_table_schema()\n\n def resetConn(self, timeout=-1):\n conn = psycopg2.connect(database=self.args.dbname,\n user=self.args.user,\n password=self.args.password,\n host=self.args.host,\n port=self.args.port)\n return conn\n\n def execute_sqls(self,sql):\n self.conn =self.resetConn(timeout=-1)\n cur = self.conn.cursor()\n cur.execute(sql)\n self.conn.commit()\n cur.close()\n self.conn.close()\n\n def execute_sql_duration(self, duration, sql, max_id=0, commit_interval=500):\n self.conn = self.resetConn(timeout=-1)\n cursor = self.conn.cursor()\n start = time.time()\n cnt = 0\n if duration > 0:\n while (time.time() - start) < duration:\n if max_id > 0:\n id = random.randint(1, max_id - 1)\n cursor.execute(sql + str(id) + ';')\n else:\n cursor.execute(sql)\n cnt += 1\n if cnt % commit_interval == 0:\n self.conn.commit()\n else:\n print(\"error, the duration 
should be larger than 0\")\n self.conn.commit()\n cursor.close()\n self.conn.close()\n return cnt\n\n def concurrent_execute_sql(self, threads, duration, sql, max_id=0, commit_interval=500):\n pool = ThreadPool(threads)\n results = [pool.apply_async(self.execute_sql_duration, (duration, sql, max_id, commit_interval)) for _ in range(threads)]\n pool.close()\n pool.join()\n return results\n \ndef init():\n #add the config\n config_path = \"/root/DB-GPT/config/tool_config.yaml\"\n with open(config_path, 'r') as config_file:\n config = yaml.safe_load(config_file) \n db_args =DBArgs('pgsql', config)\n return db_args\n\n#create a table\ndef create_table(table_name,colsize, ncolumns):\n db=Database(init())\n column_definitions = ', '.join(f'name{i} varchar({colsize})' for i in range(ncolumns))\n creat_sql = f'CREATE TABLE {table_name} (id int, {column_definitions}, time timestamp);'\n db.execute_sqls(creat_sql)\n\n#delete the table\ndef delete_table(table_name):\n db=Database(init())\n delete_sql=f'DROP TABLE if exists {table_name}'\n db.execute_sqls(delete_sql)\n\n#print the current time\ndef print_time():\n current_time = datetime.datetime.now()\n formatted_time = current_time.strftime(\"%Y-%m-%d %H:%M:%S\")\n print(formatted_time)\n\n\ndef insert_large_data(threads,duration,ncolumns,nrows,colsize,table_name='table1'):\n\n print_time()\n #Delete undeleted tables\n delete_table(table_name)\n #create a new table\n create_table(table_name,colsize, ncolumns)\n db=Database(init())\n #insert the data\n #insert_definitions = ', '.join(f'repeat(round(random()*999)::text,{(colsize//3)})' for i in range(ncolumns))\n insert_definitions = ', '.join(f'(SELECT substr(md5(random()::text), 1, {colsize}))' for i in range(ncolumns))\n insert_data=f'insert into {table_name} select generate_series(1,{nrows}),{insert_definitions}, now();'\n db.concurrent_execute_sql(threads,duration,insert_data,commit_interval=1)\n\n #delete the table\n delete_table(table_name)\n \n #print the end time\n print_time()\nif __name__ == \"__main__\":\n # Number of threads to use for concurrent inserts\n num_threads = 84\n \n # Duration for which to run the inserts (in seconds)\n insert_duration = None\n \n # Number of columns in the table\n num_columns = 10\n \n # Number of rows to insert\n num_rows = 64\n \n # Size of each column (in characters)\n column_size = 51\n \n # Table name\n table_name = 'table1'\n \n # Call the insert_large_data function\n insert_large_data(num_threads, insert_duration, num_columns, num_rows, column_size, table_name)\n", + "description": "In a data logging system, 84 sensors are simultaneously generating a large amount of data. This data needs to be inserted into a database table with 10 columns and 64 rows. Each column has a size of 51 characters. The goal is to simulate the database exception caused by this insert operation.\n", + "desc": "In a file sharing system scenario, let's imagine a database called 'FilesDB' that is used by a team or organization for sharing files. This database not only stores the files themselves but also records metadata about the files, such as uploader information, file size, creation and modification dates, version history, access permissions, and download counts. Multiple users may be simultaneously uploading, downloading, or editing files using this system. For example, a project team collaborates on an important report, with team members frequently uploading the latest versions of files while others download them for viewing or editing. 
Additionally, the system might be used for storing and sharing large files, such as presentations, video conference recordings, or design drawings. In this specific case, 84 users are simultaneously performing large-scale data insertion, where each insertion involves creating a new row in the 'FilesDB' database. This could potentially result in high write activity to the database and could impact its performance, especially if the database is not properly optimized or lacks sufficient resources to handle such a large volume of concurrent write requests. This could lead to slower upload speeds, delayed file availability, or even temporary service interruptions, depending on the system's capability to handle the increased workload.\n" + }, + "265": { + "start_time": "1697335487", + "end_time": "1697335558", + "start_timestamp": "2023-10-15 10:04:47", + "end_timestamp": "2023-10-15 10:05:58", + "alerts": [], + "labels": [ + "highly concurrent commits or highly concurrent inserts" + ], + "command": "python anomaly_trigger/main.py --anomaly INSERT_LARGE_DATA", + "script": "import psycopg2\nimport sys\nsys.path.append('/root/DB-GPT/')\nimport time\nimport datetime\nimport random\nimport yaml\nfrom multiprocessing.pool import *\n\nclass DBArgs(object):\n\n def __init__(self, dbtype, config, dbname=None):\n self.dbtype = dbtype\n if self.dbtype == 'mysql':\n self.host = config['host']\n self.port = config['port']\n self.user = config['user']\n self.password = config['password']\n self.dbname = dbname if dbname else config['dbname']\n self.driver = 'com.mysql.jdbc.Driver'\n self.jdbc = 'jdbc:mysql://'\n else:\n self.host = config['host']\n self.port = config['port']\n self.user = config['user']\n self.password = config['password']\n self.dbname = dbname if dbname else config['dbname']\n self.driver = 'org.postgresql.Driver'\n self.jdbc = 'jdbc:postgresql://'\n\nclass Database():\n def __init__(self, args, timeout=-1):\n self.args = args\n self.conn = self.resetConn(timeout)\n\n\n # self.schema = self.compute_table_schema()\n\n def resetConn(self, timeout=-1):\n conn = psycopg2.connect(database=self.args.dbname,\n user=self.args.user,\n password=self.args.password,\n host=self.args.host,\n port=self.args.port)\n return conn\n\n def execute_sqls(self,sql):\n self.conn =self.resetConn(timeout=-1)\n cur = self.conn.cursor()\n cur.execute(sql)\n self.conn.commit()\n cur.close()\n self.conn.close()\n\n def execute_sql_duration(self, duration, sql, max_id=0, commit_interval=500):\n self.conn = self.resetConn(timeout=-1)\n cursor = self.conn.cursor()\n start = time.time()\n cnt = 0\n if duration > 0:\n while (time.time() - start) < duration:\n if max_id > 0:\n id = random.randint(1, max_id - 1)\n cursor.execute(sql + str(id) + ';')\n else:\n cursor.execute(sql)\n cnt += 1\n if cnt % commit_interval == 0:\n self.conn.commit()\n else:\n print(\"error, the duration should be larger than 0\")\n self.conn.commit()\n cursor.close()\n self.conn.close()\n return cnt\n\n def concurrent_execute_sql(self, threads, duration, sql, max_id=0, commit_interval=500):\n pool = ThreadPool(threads)\n results = [pool.apply_async(self.execute_sql_duration, (duration, sql, max_id, commit_interval)) for _ in range(threads)]\n pool.close()\n pool.join()\n return results\n \ndef init():\n #add the config\n config_path = \"/root/DB-GPT/config/tool_config.yaml\"\n with open(config_path, 'r') as config_file:\n config = yaml.safe_load(config_file) \n db_args =DBArgs('pgsql', config)\n return db_args\n\n#create a table\ndef 
create_table(table_name,colsize, ncolumns):\n db=Database(init())\n column_definitions = ', '.join(f'name{i} varchar({colsize})' for i in range(ncolumns))\n creat_sql = f'CREATE TABLE {table_name} (id int, {column_definitions}, time timestamp);'\n db.execute_sqls(creat_sql)\n\n#delete the table\ndef delete_table(table_name):\n db=Database(init())\n delete_sql=f'DROP TABLE if exists {table_name}'\n db.execute_sqls(delete_sql)\n\n#print the current time\ndef print_time():\n current_time = datetime.datetime.now()\n formatted_time = current_time.strftime(\"%Y-%m-%d %H:%M:%S\")\n print(formatted_time)\n\n\ndef insert_large_data(threads,duration,ncolumns,nrows,colsize,table_name='table1'):\n\n print_time()\n #Delete undeleted tables\n delete_table(table_name)\n #create a new table\n create_table(table_name,colsize, ncolumns)\n db=Database(init())\n #insert the data\n #insert_definitions = ', '.join(f'repeat(round(random()*999)::text,{(colsize//3)})' for i in range(ncolumns))\n insert_definitions = ', '.join(f'(SELECT substr(md5(random()::text), 1, {colsize}))' for i in range(ncolumns))\n insert_data=f'insert into {table_name} select generate_series(1,{nrows}),{insert_definitions}, now();'\n db.concurrent_execute_sql(threads,duration,insert_data,commit_interval=1)\n\n #delete the table\n delete_table(table_name)\n \n #print the end time\n print_time()\nif __name__ == \"__main__\":\n # Number of threads to use for concurrent inserts\n num_threads = 84\n \n # Duration for which to run the inserts (in seconds)\n insert_duration = None\n \n # Number of columns in the table\n num_columns = 39\n \n # Number of rows to insert\n num_rows = 60\n \n # Size of each column (in characters)\n column_size = 66\n \n # Table name\n table_name = 'table1'\n \n # Call the insert_large_data function\n insert_large_data(num_threads, insert_duration, num_columns, num_rows, column_size, table_name)\n", + "description": "In an industrial monitoring system, 84 sensors are generating a large amount of data simultaneously. This data needs to be inserted into a database table containing 39 columns and 60 rows. Each column can hold up to 66 characters. By simulating this scenario, we can trigger a database exception caused by the insertion of large data.\n", + "desc": "In an Internet of Things (IoT) scenario, there is a database named 'IoTDataDB' that is used for collecting and analyzing sensor data from various IoT devices. This database contains a key table called 'SensorData', which stores data from 60 sensors. Each row in this table represents a data entry for a specific sensor reading, with a total of 39 columns, each containing information of up to 66 characters. These columns may include sensor ID, sensor type (such as temperature, humidity, pressure, light), timestamp, location, status, and other relevant data. During a specific period, 84 sensors of different types start transmitting data to the database simultaneously. However, due to the large number of concurrent write requests and the lack of effective data partitioning or indexing on the 'SensorData' table, the database might face performance issues. As a result, the database's ability to process these numerous write requests efficiently may be limited. This can lead to increased write latency and affect the overall performance and responsiveness of the database. 
Such anomalies could hinder the real-time processing and analysis of sensor data, impacting the functionality and effectiveness of the IoT system.\n" + }, + "266": { + "start_time": "1697335618", + "end_time": "1697335678", + "start_timestamp": "2023-10-15 10:06:58", + "end_timestamp": "2023-10-15 10:07:58", + "alerts": [], + "labels": [ + "highly concurrent updates" + ], + "command": "python anomaly_trigger/main.py --anomaly LOCK_CONTENTION", + "script": "import psycopg2\nimport sys\nsys.path.append('/root/DB-GPT/')\nimport time\nimport datetime\nimport random\nimport yaml\nfrom multiprocessing.pool import *\n\n\nclass DBArgs(object):\n\n def __init__(self, dbtype, config, dbname=None):\n self.dbtype = dbtype\n if self.dbtype == 'mysql':\n self.host = config['host']\n self.port = config['port']\n self.user = config['user']\n self.password = config['password']\n self.dbname = dbname if dbname else config['dbname']\n self.driver = 'com.mysql.jdbc.Driver'\n self.jdbc = 'jdbc:mysql://'\n else:\n self.host = config['host']\n self.port = config['port']\n self.user = config['user']\n self.password = config['password']\n self.dbname = dbname if dbname else config['dbname']\n self.driver = 'org.postgresql.Driver'\n self.jdbc = 'jdbc:postgresql://'\n\nclass Database():\n def __init__(self, args, timeout=-1):\n self.args = args\n self.conn = self.resetConn(timeout)\n\n\n # self.schema = self.compute_table_schema()\n\n def resetConn(self, timeout=-1):\n conn = psycopg2.connect(database=self.args.dbname,\n user=self.args.user,\n password=self.args.password,\n host=self.args.host,\n port=self.args.port)\n return conn\n \n def execute_sqls(self,sql):\n self.conn =self.resetConn(timeout=-1)\n cur = self.conn.cursor()\n cur.execute(sql)\n self.conn.commit()\n cur.close()\n self.conn.close()\n\n def execute_sql_duration(self, duration, sql, max_id=0, commit_interval=500):\n self.conn = self.resetConn(timeout=-1)\n cursor = self.conn.cursor()\n start = time.time()\n cnt = 0\n if duration > 0:\n while (time.time() - start) < duration:\n if max_id > 0:\n id = random.randint(1, max_id - 1)\n cursor.execute(sql + str(id) + ';')\n else:\n cursor.execute(sql)\n cnt += 1\n if cnt % commit_interval == 0:\n self.conn.commit()\n else:\n print(\"error, the duration should be larger than 0\")\n self.conn.commit()\n cursor.close()\n self.conn.close()\n return cnt\n\n def concurrent_execute_sql(self, threads, duration, sql, max_id=0, commit_interval=500):\n pool = ThreadPool(threads)\n results = [pool.apply_async(self.execute_sql_duration, (duration, sql, max_id, commit_interval)) for _ in range(threads)]\n pool.close()\n pool.join()\n return results\n\ndef init():\n #add the config\n config_path = \"/root/DB-GPT/config/tool_config.yaml\"\n with open(config_path, 'r') as config_file:\n config = yaml.safe_load(config_file) \n db_args =DBArgs('pgsql', config)\n return db_args\n\n\n#create a table\ndef create_table(table_name,colsize, ncolumns):\n db=Database(init())\n column_definitions = ', '.join(f'name{i} varchar({colsize})' for i in range(ncolumns))\n creat_sql = f'CREATE TABLE {table_name} (id int, {column_definitions}, time timestamp);'\n db.execute_sqls(creat_sql)\n\n#delete the table\ndef delete_table(table_name):\n db=Database(init())\n delete_sql=f'DROP TABLE if exists {table_name}'\n db.execute_sqls(delete_sql)\n\n#print the current time\ndef print_time():\n current_time = datetime.datetime.now()\n formatted_time = current_time.strftime(\"%Y-%m-%d %H:%M:%S\")\n print(formatted_time)\n\n\ndef 
lock_contention(threads,duration,ncolumns,nrows,colsize,table_name='table1'):\n #create a new table\n print_time()\n delete_table(table_name)\n create_table(table_name,colsize, ncolumns)\n db=Database(init())\n # insert some data to be updated \n insert_definitions = ', '.join(f'(SELECT substr(md5(random()::text), 1, {colsize}))' for i in range(ncolumns))\n insert_data=f'insert into {table_name} select generate_series(1,{nrows}),{insert_definitions}, now();' \n db.execute_sqls(insert_data) \n pool = Pool(threads)\n for _ in range(threads):\n pool.apply_async(\n lock, (table_name, ncolumns, colsize, duration, nrows))\n pool.close()\n pool.join()\n #delete the table\n delete_table(table_name)\n print_time()\n\ndef lock(table_name, ncolumns, colsize, duration, nrows):\n args=init()\n start = time.time()\n #lock_contention\n while time.time()-start < duration:\n conn = psycopg2.connect(database=args.dbname, user=args.user, password=args.password,\n host=args.host, port=args.port)\n cur = conn.cursor()\n while time.time()-start < duration:\n col_name = random.randint(0, ncolumns-1)\n row_name = random.randint(1, nrows-1)\n lock_contention = f'update {table_name} set name{col_name}=(SELECT substr(md5(random()::text), 1, {colsize})) where id ={row_name}'\n #db.concurrent_execute_sql(threads,duration,lock_contention,nrows)\n cur.execute(lock_contention)\n conn.commit()\n conn.commit()\n conn.close()\n\nif __name__ == \"__main__\":\n # Number of threads to use for concurrent inserts\n num_threads = 79\n \n # Duration for which to run the inserts (in seconds)\n insert_duration = None\n \n # Number of columns in the table\n num_columns = 93\n \n # Number of rows to insert\n num_rows = 384\n \n # Size of each column (in characters)\n column_size = 62\n \n # Table name\n table_name = 'table1'\n \n # Call the insert_large_data function\n lock_contention(num_threads, insert_duration, num_columns, num_rows, column_size, table_name)\n", + "description": "In a database used by an online store platform, there are 79 users concurrently attempting to perform frequent update operations on a database table. The table contains 93 columns and 384 rows of product records, with each column having a size of 62 characters. These users are competing for locks on the database table, causing contention and potential exceptions in the process.\n", + "desc": "In a banking scenario, there is a database called 'BankingDB' that handles customer and transaction data for a bank. The database contains a table named 'AccountTransactions' which stores detailed information about various banking transactions. This table consists of 384 rows of data, where each row represents a transaction record for an account. There are a total of 93 columns in this table, each containing information of up to 62 characters. These columns include transaction ID, account number, transaction type (e.g., deposit, withdrawal, transfer), transaction amount, transaction date and time, counterparty account information, transaction status, employee ID, transaction location, currency type, and more.In this scenario, there are 79 users attempting to perform frequent update operations on the 'AccountTransactions' table simultaneously. These operations could involve updating transaction statuses, modifying transaction amounts, or adding transaction notes. Due to the design of the table and the locking mechanism of the database, these concurrent update operations may result in contention for locking the table. 
If the locking is prolonged, it can cause performance issues in the database.During peak business hours, such prolonged locking can lead to delays or failures in processing other users' transaction requests, thereby affecting the daily operations of the bank. If these incidents happen frequently, they can also cause the transaction log of the database to grow rapidly, consuming excessive storage space, and potentially causing temporary interruptions in database services.\n" + }, + "267": { + "start_time": "1697335738", + "end_time": "1697335852", + "start_timestamp": "2023-10-15 10:08:58", + "end_timestamp": "2023-10-15 10:10:52", + "alerts": [], + "labels": [ + "highly deletes" + ], + "command": "python anomaly_trigger/main.py --anomaly VACUUM", + "script": "import psycopg2\nimport sys\nsys.path.append('/root/DB-GPT/')\nimport time\nimport datetime\nimport random\nimport yaml\nfrom multiprocessing.pool import *\n\n\nclass DBArgs(object):\n\n def __init__(self, dbtype, config, dbname=None):\n self.dbtype = dbtype\n if self.dbtype == 'mysql':\n self.host = config['host']\n self.port = config['port']\n self.user = config['user']\n self.password = config['password']\n self.dbname = dbname if dbname else config['dbname']\n self.driver = 'com.mysql.jdbc.Driver'\n self.jdbc = 'jdbc:mysql://'\n else:\n self.host = config['host']\n self.port = config['port']\n self.user = config['user']\n self.password = config['password']\n self.dbname = dbname if dbname else config['dbname']\n self.driver = 'org.postgresql.Driver'\n self.jdbc = 'jdbc:postgresql://'\n\nclass Database():\n def __init__(self, args, timeout=-1):\n self.args = args\n self.conn = self.resetConn(timeout)\n\n\n # self.schema = self.compute_table_schema()\n\n def resetConn(self, timeout=-1):\n conn = psycopg2.connect(database=self.args.dbname,\n user=self.args.user,\n password=self.args.password,\n host=self.args.host,\n port=self.args.port)\n return conn\n \n def execute_sqls(self,sql):\n self.conn =self.resetConn(timeout=-1)\n cur = self.conn.cursor()\n cur.execute(sql)\n self.conn.commit()\n cur.close()\n self.conn.close()\n\n def execute_sql_duration(self, duration, sql, max_id=0, commit_interval=500):\n self.conn = self.resetConn(timeout=-1)\n cursor = self.conn.cursor()\n start = time.time()\n cnt = 0\n if duration > 0:\n while (time.time() - start) < duration:\n if max_id > 0:\n id = random.randint(1, max_id - 1)\n cursor.execute(sql + str(id) + ';')\n else:\n cursor.execute(sql)\n cnt += 1\n if cnt % commit_interval == 0:\n self.conn.commit()\n else:\n print(\"error, the duration should be larger than 0\")\n self.conn.commit()\n cursor.close()\n self.conn.close()\n return cnt\n\n def concurrent_execute_sql(self, threads, duration, sql, max_id=0, commit_interval=500):\n pool = ThreadPool(threads)\n results = [pool.apply_async(self.execute_sql_duration, (duration, sql, max_id, commit_interval)) for _ in range(threads)]\n pool.close()\n pool.join()\n return results\n \ndef init():\n #add the config\n config_path = \"/root/DB-GPT/config/tool_config.yaml\"\n with open(config_path, 'r') as config_file:\n config = yaml.safe_load(config_file) \n db_args =DBArgs('pgsql', config)\n return db_args\n\n\n#create a table\ndef create_table(table_name,colsize, ncolumns):\n db=Database(init())\n column_definitions = ', '.join(f'name{i} varchar({colsize})' for i in range(ncolumns))\n creat_sql = f'CREATE TABLE {table_name} (id int, {column_definitions}, time timestamp);'\n db.execute_sqls(creat_sql)\n\n#delete the table\ndef 
delete_table(table_name):\n db=Database(init())\n delete_sql=f'DROP TABLE if exists {table_name}'\n db.execute_sqls(delete_sql)\n\n#print the current time\ndef print_time():\n current_time = datetime.datetime.now()\n formatted_time = current_time.strftime(\"%Y-%m-%d %H:%M:%S\")\n print(formatted_time)\n\n\ndef vacuum(threads,duration,ncolumns,nrows,colsize,table_name='table1'):\n db=Database(init())\n #create a new table\n print_time()\n delete_table(table_name)\n create_table(table_name,colsize, ncolumns)\n\n # insert some data to be deleted\n insert_definitions = ', '.join(f'(SELECT substr(md5(random()::text), 1, {colsize}))' for i in range(ncolumns))\n insert_data=f'insert into {table_name} select generate_series(1,{nrows}),{insert_definitions}, now();' \n db.execute_sqls(insert_data) \n\n # delete 80% of the rows\n delete_nrows=int(nrows*0.8)\n vacuum=f'delete from {table_name} where id < {delete_nrows};'\n db.execute_sqls(vacuum)\n\n # do the select , then the vacuum occurs\n select='select * from '+table_name+' where id='\n db.concurrent_execute_sql(threads,duration,select,nrows)\n\n #delete the table\n delete_table(table_name)\n print_time()\n\n\nif __name__ == \"__main__\":\n # Number of threads to use for concurrent inserts\n num_threads = 140\n \n # Duration for which to run the inserts (in seconds)\n insert_duration = None\n \n # Number of columns in the table\n num_columns = 10\n \n # Number of rows to insert\n num_rows = 2402256\n \n # Size of each column (in characters)\n column_size = 66\n \n # Table name\n table_name = 'table1'\n \n # Call the insert_large_data function\n vacuum(num_threads, insert_duration, num_columns, num_rows, column_size, table_name)\n", + "description": "In an online store database, if 140 users simultaneously perform a search after a large-scale data cleaning operation on a database table containing 10 columns, 2,402,256 rows, and each column size of 66 characters for product records, an exception is caused.\n", + "desc": "In an e-commerce scenario, imagine a database called 'ECommerceDB' that stores information about various products. This database includes a key table called 'ProductDetails', where each row represents a product and contains 10 columns, each capable of holding up to 66 characters. These columns may include product ID, name, price, stock quantity, brand, category, size, color, weight, and description. In this case, the database administrator needs to perform a vacuum operation, which is a process of reclaiming storage space by removing unnecessary data and optimizing database performance. The database administrator sets the number of threads to 140, indicating a high level of concurrency in the vacuum operation. Additionally, the 'ProductDetails' table has a total of 2,402,256 rows, meaning that a substantial amount of data needs to be processed. 
The vacuum operation aims to improve the database's overall efficiency by reclaiming space and optimizing data storage.\n" + }, + "268": { + "start_time": "1697335912", + "end_time": "1697336027", + "start_timestamp": "2023-10-15 10:11:52", + "end_timestamp": "2023-10-15 10:13:47", + "alerts": [], + "labels": [ + "too many indexes" + ], + "command": "python anomaly_trigger/main.py --anomaly REDUNDANT_INDEX", + "script": "import psycopg2\nimport sys\nsys.path.append('/root/DB-GPT/')\nimport time\nimport datetime\nimport random\nimport yaml\nfrom multiprocessing.pool import *\n\n\nclass DBArgs(object):\n\n def __init__(self, dbtype, config, dbname=None):\n self.dbtype = dbtype\n if self.dbtype == 'mysql':\n self.host = config['host']\n self.port = config['port']\n self.user = config['user']\n self.password = config['password']\n self.dbname = dbname if dbname else config['dbname']\n self.driver = 'com.mysql.jdbc.Driver'\n self.jdbc = 'jdbc:mysql://'\n else:\n self.host = config['host']\n self.port = config['port']\n self.user = config['user']\n self.password = config['password']\n self.dbname = dbname if dbname else config['dbname']\n self.driver = 'org.postgresql.Driver'\n self.jdbc = 'jdbc:postgresql://'\n\nclass Database():\n def __init__(self, args, timeout=-1):\n self.args = args\n self.conn = self.resetConn(timeout)\n\n\n # self.schema = self.compute_table_schema()\n\n def resetConn(self, timeout=-1):\n conn = psycopg2.connect(database=self.args.dbname,\n user=self.args.user,\n password=self.args.password,\n host=self.args.host,\n port=self.args.port)\n return conn\n \n def execute_sqls(self,sql):\n self.conn =self.resetConn(timeout=-1)\n cur = self.conn.cursor()\n cur.execute(sql)\n self.conn.commit()\n cur.close()\n self.conn.close()\n\n def execute_sql_duration(self, duration, sql, max_id=0, commit_interval=500):\n self.conn = self.resetConn(timeout=-1)\n cursor = self.conn.cursor()\n start = time.time()\n cnt = 0\n if duration > 0:\n while (time.time() - start) < duration:\n if max_id > 0:\n id = random.randint(1, max_id - 1)\n cursor.execute(sql + str(id) + ';')\n else:\n cursor.execute(sql)\n cnt += 1\n if cnt % commit_interval == 0:\n self.conn.commit()\n else:\n print(\"error, the duration should be larger than 0\")\n self.conn.commit()\n cursor.close()\n self.conn.close()\n return cnt\n\n def concurrent_execute_sql(self, threads, duration, sql, max_id=0, commit_interval=500):\n pool = ThreadPool(threads)\n results = [pool.apply_async(self.execute_sql_duration, (duration, sql, max_id, commit_interval)) for _ in range(threads)]\n pool.close()\n pool.join()\n return results\n \n def build_index(self, table_name, idx_num):\n self.conn = self.resetConn(timeout=-1)\n cursor = self.conn.cursor()\n \n for i in range(0, idx_num):\n the_sql = 'CREATE INDEX index_' + table_name + '_' + str(i) + ' ON ' + table_name + '(name' + str(i) + ');'\n print(the_sql)\n cursor.execute(the_sql)\n\n \n self.conn.commit()\n self.conn.close()\n return\n\n\n \n def drop_index(self,table_name):\n self.conn = self.resetConn(timeout=-1)\n cursor = self.conn.cursor()\n cursor.execute(\"select indexname from pg_indexes where tablename='\"+table_name+\"';\")\n idxs = cursor.fetchall()\n for idx in idxs:\n the_sql = 'DROP INDEX ' + idx[0] + ';'\n cursor.execute(the_sql)\n print(the_sql)\n self.conn.commit()\n self.conn.close()\n return\n\n\ndef init():\n #add the config\n config_path = \"/root/DB-GPT/config/tool_config.yaml\"\n with open(config_path, 'r') as config_file:\n config = yaml.safe_load(config_file) 
\n db_args =DBArgs('pgsql', config)\n return db_args\n\n\n#create a table\ndef create_table(table_name,colsize, ncolumns):\n db=Database(init())\n column_definitions = ', '.join(f'name{i} varchar({colsize})' for i in range(ncolumns))\n creat_sql = f'CREATE TABLE {table_name} (id int, {column_definitions}, time timestamp);'\n db.execute_sqls(creat_sql)\n\n#delete the table\ndef delete_table(table_name):\n db=Database(init())\n delete_sql=f'DROP TABLE if exists {table_name}'\n db.execute_sqls(delete_sql)\n\n#print the current time\ndef print_time():\n current_time = datetime.datetime.now()\n formatted_time = current_time.strftime(\"%Y-%m-%d %H:%M:%S\")\n print(formatted_time)\n\ndef redundent_index(threads,duration,ncolumns,nrows,colsize,nindex,table_name='table1'):\n #create a new table\n print_time()\n delete_table(table_name)\n create_table(table_name,colsize, ncolumns)\n db=Database(init())\n # insert some data to be updated \n insert_definitions = ', '.join(f'(SELECT substr(md5(random()::text), 1, {colsize}))' for i in range(ncolumns))\n insert_data=f'insert into {table_name} select generate_series(1,{nrows}),{insert_definitions}, now();' \n db.execute_sqls(insert_data) \n\n #initialization of the indexes\n nindex=int((nindex*ncolumns)/10)\n db.build_index(table_name,nindex)\n id_index='CREATE INDEX index_'+table_name+'_id ON '+table_name+'(id);'\n db.execute_sqls(id_index)\n\n #lock_contention\n pool = Pool(threads)\n for _ in range(threads):\n pool.apply_async(\n lock, (table_name, ncolumns, colsize, duration, nrows))\n pool.close()\n pool.join()\n\n #drop the index\n db.drop_index(table_name)\n\n #delete the table\n delete_table(table_name)\n print_time()\n\n\ndef lock(table_name, ncolumns, colsize, duration, nrows):\n args=init()\n start = time.time()\n #lock_contention\n while time.time()-start < duration:\n conn = psycopg2.connect(database=args.dbname, user=args.user, password=args.password,\n host=args.host, port=args.port)\n cur = conn.cursor()\n while time.time()-start < duration:\n col_name = random.randint(0, ncolumns-1)\n row_name = random.randint(1, nrows-1)\n lock_contention = f'update {table_name} set name{col_name}=(SELECT substr(md5(random()::text), 1, {colsize})) where id ={row_name}'\n #db.concurrent_execute_sql(threads,duration,lock_contention,nrows)\n cur.execute(lock_contention)\n conn.commit()\n conn.commit()\n conn.close()\n\nif __name__ == \"__main__\":\n # Number of threads to use for concurrent inserts\n num_threads = 9\n \n # Duration for which to run the inserts (in seconds)\n insert_duration = None\n \n # Number of columns in the table\n num_columns = 86\n \n # Number of rows to insert\n num_rows = 595153\n \n # Size of each column (in characters)\n column_size = 57\n \n # Table name\n table_name = 'table1'\n \n nindex=6\n \n # Call the insert_large_data function\n redundent_index(num_threads, insert_duration, num_columns, num_rows, column_size, nindex,table_name)\n", + "description": "In a database used for an e-commerce website, 9 users perform a query operation on a table containing 86 columns and 595,153 rows, with each column containing data of size 57 characters. However, before the query operation, a large number of indexes are created for attributes like product name, category, and price range. 
This can lead to additional storage requirements and performance overhead.\n", + "desc": "In a business intelligence scenario, involving the financial statements of large companies, there is a database called 'CorporateFinanceDB' that stores and processes financial data. This database contains multiple tables, one of which is the 'FinancialRecords' table, which records various financial transactions and statement information. The 'FinancialRecords' table has a total of 595,153 rows, each representing a financial record, with 86 columns, each containing information of up to 57 characters. These columns include transaction ID, transaction type (such as income, expenditure, assets, liabilities), amount, date, department, project code, budget code, financial year, audit status, and more. In this scenario, the database administrator creates redundant indexes before executing financial queries to accelerate query performance. This involves creating indexes based on various criteria such as transaction type, date range, department, or project code. However, during peak periods, when multiple users simultaneously execute complex financial queries that require the creation of these redundant indexes, it can lead to additional storage usage and performance overhead in the database. The frequent creation and deletion of indexes can also cause database fragmentation and impact overall system performance. As a result, this scenario could experience delayed report generation and affect the efficiency of decision-making processes.\n" + }, + "269": { + "start_time": "1697336087", + "end_time": "1697336178", + "start_timestamp": "2023-10-15 10:14:47", + "end_timestamp": "2023-10-15 10:16:18", + "alerts": [], + "labels": [ + "INSERT_LARGE_DATA", + "IO_CONTENTION" + ], + "command": "python anomaly_trigger/main.py --anomaly INSERT_LARGE_DATA,IO_CONTENTION", + "script": "import os\nimport datetime\n\n#print the current time\ndef print_time():\n current_time = datetime.datetime.now()\n formatted_time = current_time.strftime(\"%Y-%m-%d %H:%M:%S\")\n print(formatted_time)\n\nif __name__ == \"__main__\":\n print_time()\n command = (\n \"su - root -c 'cd /sysbench-tpcc-master; \"\n \"./tpcc.lua --db-driver=pgsql --tables=2 --scale=3 --threads=50 --events=0 \"\n \"--pgsql-host=xxxx --pgsql-user=xxxx --pgsql-password=xxxx \"\n \"--pgsql-port=5432 --pgsql-db=tpcc --time=90 --rand-type=uniform --report-interval=10 run'\"\n )\n\n os.system(command)\n print_time()\n", + "description": "In a file sharing system where multiple users are simultaneously uploading, downloading, or editing files, there is a competition for input/output resources. This competition causes a slowdown in file transfer operations.\n", + "desc": "In the file transfer system scenario, there is a database called 'TeamFileShareDB' that is used by teams or organizations for sharing files. This database stores both the files themselves and the metadata associated with them, such as uploader information, file size, creation and modification dates, version history, access permissions, and download counts. Users often perform actions such as uploading, downloading, and editing files simultaneously. For example, a project team might be collaborating on an important report, where team members frequently upload the latest versions of files and others download them for viewing or editing. The system also supports storing and sharing large files like presentations, video conference recordings, and design drawings. 
With so many users performing file operations concurrently, the database encounters I/O (input/output) contention. This means that when multiple users are uploading or downloading large files at the same time, the system's storage and network bandwidth may become strained. As a result, file transfer speeds may slow down, especially if there is limited bandwidth or insufficient server processing capabilities. Additionally, frequent write operations in the database, such as file uploads and metadata updates, can impact overall database performance. During peak usage periods, the database may experience issues with locking and transaction management, leading to further delays in file processing and metadata recording.\n" + }, + "270": { + "start_time": "1697336238", + "end_time": "1697336299", + "start_timestamp": "2023-10-15 10:17:18", + "end_timestamp": "2023-10-15 10:18:19", + "alerts": [], + "labels": [ + "POOR JOIN PERFORMANCE", + "CPU CONTENTION" + ], + "command": "python anomaly_trigger/main.py --anomaly POOR_JOIN_PERFORMANCE,CPU_CONTENTION", + "script": "import os\nimport re\nimport time\n\nimport psycopg2\n\n\nREPEATCOUNT = 1\nTIMELOGPATH = str(int(time.time())) + \"_job_trigger_time_log.txt\"\nTIMELOG = open(TIMELOGPATH, 'w+')\n\n\nclass Database():\n\n def __init__(self):\n self.conn = None\n self.conn = psycopg2.connect(database='imdbload',\n user='xxxx',\n password='xxxx',\n host='xxxx',\n port=5432)\n\n def execute_sql(self, sql):\n fail = 1\n cur = self.conn.cursor()\n i = 0\n cnt = 3\n while fail == 1 and i < cnt:\n try:\n fail = 0\n cur.execute(sql)\n except BaseException as error:\n fail = 1\n print(error)\n res = []\n if fail == 0:\n res = cur.fetchall()\n i = i + 1\n if fail == 1:\n # print(\"SQL Execution Fatal!!\", sql)\n return 0, ''\n elif fail == 0:\n return 1, res\n\n\ndef all_sql_files():\n res_path = \"{}/join-order-benchmark-master/\".format(\n os.path.dirname(os.path.abspath(__file__)))\n # all_file_list = list(filter(file_filter, os.listdir(res_path)))\n # all_file_list = sorted(all_file_list, key=custom_sort)\n all_file_list = [\n '1a.sql', '1b.sql', '1c.sql', '1d.sql',\n '2a.sql', '2b.sql', '2c.sql', '2d.sql',\n '3a.sql', '3b.sql', '3c.sql',\n '4a.sql', '4b.sql', '4c.sql',\n '5a.sql', '5b.sql', '5c.sql',\n '6a.sql', '6b.sql', '6c.sql', '6d.sql', '6e.sql', '6f.sql',\n '7a.sql', '7b.sql', '7c.sql',\n '8a.sql', '8b.sql', '8c.sql', '8d.sql',\n '9a.sql', '9b.sql', '9c.sql', '9d.sql',\n '10a.sql', '10b.sql', '10c.sql',\n '11a.sql', '11b.sql', '11c.sql', '11d.sql',\n '12a.sql', '12b.sql', '12c.sql',\n '13a.sql', '13b.sql', '13c.sql', '13d.sql',\n '14a.sql', '14b.sql', '14c.sql',\n '15a.sql', '15b.sql', '15c.sql', '15d.sql',\n '16a.sql', '16b.sql', '16c.sql', '16d.sql',\n '17a.sql', '17b.sql', '17c.sql', '17d.sql', '17e.sql', '17f.sql',\n '18a.sql', '18b.sql', '18c.sql',\n '19a.sql', '19b.sql', '19c.sql', '19d.sql',\n '20a.sql', '20b.sql', '20c.sql',\n '21a.sql', '21b.sql', '21c.sql',\n '22a.sql', '22b.sql', '22c.sql', '22d.sql',\n '23a.sql', '23b.sql', '23c.sql',\n '24a.sql', '24b.sql',\n '25a.sql', '25b.sql', '25c.sql',\n '26a.sql', '26b.sql', '26c.sql',\n '27a.sql', '27b.sql', '27c.sql',\n '28a.sql', '28b.sql', '28c.sql',\n '29a.sql', '29b.sql', '29c.sql',\n '30a.sql', '30b.sql', '30c.sql',\n '31a.sql', '31b.sql', '31c.sql',\n '32a.sql', '32b.sql',\n '33a.sql', '33b.sql', '33c.sql']\n\n print(all_file_list)\n files_list = []\n for file in all_file_list:\n files_list.append(res_path + file)\n return files_list\n\n\ndef custom_sort(item):\n # 
\u63d0\u53d6\u6570\u5b57\u548c\u5b57\u6bcd\u90e8\u5206\n match = re.match(r'(\\d+)(\\D+)', item)\n # \u5c06\u6570\u5b57\u90e8\u5206\u8f6c\u6362\u4e3a\u6574\u6570\u4ee5\u8fdb\u884c\u6bd4\u8f83\n num_part = int(match.group(1))\n # \u8fd4\u56de\u5143\u7ec4\u4ee5\u6309\u6570\u5b57\u548c\u5b57\u6bcd\u6392\u5e8f\n return (num_part, match.group(2))\n\n\ndef file_filter(f):\n if f[-4:] == '.sql' and 'schema' not in f and 'fkindexes' not in f:\n return True\n else:\n return False\n\n\ndef get_sql_from_file(file_name):\n file = open(file_name)\n lines = file.readlines().copy()\n sql = ''\n for line in lines:\n sql += line\n sql = sql.replace('\n', ' ').replace(' ', ' ').replace(' ', ' ')\n file.close()\n return sql\n\n\ndef test_hint_from_file(sql_file):\n db = Database()\n sql = get_sql_from_file(sql_file)\n success, result_cont = db.execute_sql(sql)\n print(success, result_cont)\n\n\ndef test_all():\n sql_files = all_sql_files()\n\n for sql_file in list(sql_files)[:-10]:\n if sql_file:\n test_hint_from_file(sql_file)\n\n\ndef test_one():\n res_path = \"{}/join-order-benchmark-master/\".format(\n os.path.dirname(os.path.abspath(__file__)))\n test_hint_from_file(res_path + '1a.sql')\n\n\nif __name__ == '__main__':\n for i in range(0, REPEATCOUNT):\n TIMELOG.write(str(int(time.time()))+\";\")\n test_all()\n TIMELOG.write(str(int(time.time()))+\"\n\")\n TIMELOG.flush()\n\n TIMELOG.close()\n\n)\n", + "description": "In a database management system, when performing a join operation on multiple tables to retrieve data, if the tables are large and the join operation is not optimized, it can result in poor performance. Additionally, if there is contention for CPU resources due to multiple processes running simultaneously, it can further impact the join performance.\n", + "desc": "In an Internet of Things (IoT) scenario, suppose there is a database called 'IoTDataDB' that stores data collected from various types of sensors. This database is designed to handle a large volume of sensor data, including temperature, humidity, pressure, light, motion, and more. The primary table in the database is called 'SensorReadings', which contains information about the sensor ID, reading type, reading value, timestamp, sensor location, and status. In this scenario, due to poor performance in joining queries, the database encounters slowdowns when analysts try to join multiple tables to generate comprehensive reports. For example, they might need to join the 'SensorReadings' table with other tables that contain information about the sensor types, locations, or environmental conditions. If the join keys are not properly indexed or if the queries are not optimized, executing these join operations can be time-consuming and resource-intensive. 
Moreover, in situations where multiple complex join queries are executed simultaneously, there may be competition for CPU resources, leading to further degradation in query performance.\n" + }, + "271": { + "start_time": "1697336359", + "end_time": "1697336499", + "start_timestamp": "2023-10-15 10:19:19", + "end_timestamp": "2023-10-15 10:21:39", + "alerts": [], + "labels": [ + "FETCH_LARGE_DATA", + "CORRELATED SUBQUERY" + ], + "command": "python anomaly_trigger/main.py --anomaly FETCH_LARGE_DATA,CORRELATED_SUBQUERY", + "script": "import os\nimport re\nimport time\n\nimport psycopg2\n\n\nREPEATCOUNT = 1\nTIMELOGPATH = str(int(time.time())) + \"_tpch_trigger_time_log.txt\"\nTIMELOG = open(TIMELOGPATH, 'w+')\n\n\nclass Database():\n\n def __init__(self):\n self.conn = None\n self.conn = psycopg2.connect(database='tpch',\n user='xxxx',\n password='xxxx',\n host='xxxx',\n port=5432)\n\n def execute_sql(self, sql):\n fail = 1\n cur = self.conn.cursor()\n i = 0\n cnt = 3\n while fail == 1 and i < cnt:\n try:\n fail = 0\n cur.execute(sql)\n except BaseException as error:\n fail = 1\n print(error)\n res = []\n if fail == 0:\n res = cur.fetchall()\n i = i + 1\n if fail == 1:\n # print(\"SQL Execution Fatal!!\", sql)\n return 0, ''\n elif fail == 0:\n return 1, res\n\n\ndef all_sql_files():\n res_path = \"{}/tpch-queries/\".format(\n os.path.dirname(os.path.abspath(__file__)))\n # all_file_list = list(filter(file_filter, os.listdir(res_path)))\n # all_file_list = sorted(all_file_list, key=custom_sort)\n all_file_list = [\n '4.explain.sql']\n\n print(all_file_list)\n files_list = []\n for file in all_file_list:\n files_list.append(res_path + file)\n return files_list\n\n\ndef custom_sort(item):\n # \u63d0\u53d6\u6570\u5b57\u548c\u5b57\u6bcd\u90e8\u5206\n match = re.match(r'(\\d+)(\\D+)', item)\n # \u5c06\u6570\u5b57\u90e8\u5206\u8f6c\u6362\u4e3a\u6574\u6570\u4ee5\u8fdb\u884c\u6bd4\u8f83\n num_part = int(match.group(1))\n # \u8fd4\u56de\u5143\u7ec4\u4ee5\u6309\u6570\u5b57\u548c\u5b57\u6bcd\u6392\u5e8f\n return (num_part, match.group(2))\n\n\ndef file_filter(f):\n if f[-4:] == '.sql' and 'schema' not in f and 'fkindexes' not in f:\n return True\n else:\n return False\n\n\ndef get_sql_from_file(file_name):\n file = open(file_name)\n lines = file.readlines().copy()\n sql = ''\n for line in lines:\n sql += line\n sql = sql.replace('\n', ' ').replace(' ', ' ').replace(' ', ' ')\n file.close()\n return sql\n\n\ndef test_hint_from_file(sql_file):\n db = Database()\n sql = get_sql_from_file(sql_file)\n success, result_cont = db.execute_sql(sql)\n print(success, result_cont)\n\n\ndef test_all():\n sql_files = all_sql_files()\n\n for sql_file in list(sql_files):\n if sql_file:\n test_hint_from_file(sql_file)\n\n\ndef test_one():\n res_path = \"{}/tpch-queries/\".format(\n os.path.dirname(os.path.abspath(__file__)))\n test_hint_from_file(res_path + '1.explain.sql')\n\n\nif __name__ == '__main__':\n for i in range(0, REPEATCOUNT):\n TIMELOG.write(str(int(time.time()))+\";\")\n test_all()\n TIMELOG.write(str(int(time.time()))+\"\n\")\n TIMELOG.flush()\n\n TIMELOG.close()\n", + "description": "In an online store's back-end system, the script will simulate a scenario where the performance of querying inventory for a large number of products is affected due to the execution of related subqueries. 
This can happen when trying to retrieve information about the available quantity of each product, and the system is not optimized to handle such queries efficiently.\n", + "desc": "In the e-commerce scenario, we have a database called 'ProductDataDB' that stores information about various products. The database contains a key table named 'ProductDetails' that records details about different products, such as their ID, name, price, stock quantity, brand, category, size, color, weight, user rating, and number of reviews, amongst others. In this case, the anomaly being triggered is a fetch of large data with correlated subquery. This means that there is a query being performed that involves retrieving a large amount of data from the 'ProductDetails' table and using correlated subqueries. Correlated subqueries are queries that depend on the results of other subqueries.When executing this query, the database may encounter performance issues due to the large amount of data being retrieved and the complexity of the subqueries. The database may need to read a significant amount of data from the disk, leading to potential I/O bottlenecks. These bottlenecks can result in slower query execution times and reduced overall database performance.\n" + }, + "272": { + "start_time": "1697336560", + "end_time": "1697336632", + "start_timestamp": "2023-10-15 10:22:40", + "end_timestamp": "2023-10-15 10:23:52", + "alerts": [], + "labels": [ + "highly concurrent commits or highly concurrent inserts" + ], + "command": "python anomaly_trigger/main.py --anomaly INSERT_LARGE_DATA", + "script": "import psycopg2\nimport sys\nsys.path.append('/root/DB-GPT/')\nimport time\nimport datetime\nimport random\nimport yaml\nfrom multiprocessing.pool import *\n\nclass DBArgs(object):\n\n def __init__(self, dbtype, config, dbname=None):\n self.dbtype = dbtype\n if self.dbtype == 'mysql':\n self.host = config['host']\n self.port = config['port']\n self.user = config['user']\n self.password = config['password']\n self.dbname = dbname if dbname else config['dbname']\n self.driver = 'com.mysql.jdbc.Driver'\n self.jdbc = 'jdbc:mysql://'\n else:\n self.host = config['host']\n self.port = config['port']\n self.user = config['user']\n self.password = config['password']\n self.dbname = dbname if dbname else config['dbname']\n self.driver = 'org.postgresql.Driver'\n self.jdbc = 'jdbc:postgresql://'\n\nclass Database():\n def __init__(self, args, timeout=-1):\n self.args = args\n self.conn = self.resetConn(timeout)\n\n\n # self.schema = self.compute_table_schema()\n\n def resetConn(self, timeout=-1):\n conn = psycopg2.connect(database=self.args.dbname,\n user=self.args.user,\n password=self.args.password,\n host=self.args.host,\n port=self.args.port)\n return conn\n\n def execute_sqls(self,sql):\n self.conn =self.resetConn(timeout=-1)\n cur = self.conn.cursor()\n cur.execute(sql)\n self.conn.commit()\n cur.close()\n self.conn.close()\n\n def execute_sql_duration(self, duration, sql, max_id=0, commit_interval=500):\n self.conn = self.resetConn(timeout=-1)\n cursor = self.conn.cursor()\n start = time.time()\n cnt = 0\n if duration > 0:\n while (time.time() - start) < duration:\n if max_id > 0:\n id = random.randint(1, max_id - 1)\n cursor.execute(sql + str(id) + ';')\n else:\n cursor.execute(sql)\n cnt += 1\n if cnt % commit_interval == 0:\n self.conn.commit()\n else:\n print(\"error, the duration should be larger than 0\")\n self.conn.commit()\n cursor.close()\n self.conn.close()\n return cnt\n\n def concurrent_execute_sql(self, threads, 
duration, sql, max_id=0, commit_interval=500):\n pool = ThreadPool(threads)\n results = [pool.apply_async(self.execute_sql_duration, (duration, sql, max_id, commit_interval)) for _ in range(threads)]\n pool.close()\n pool.join()\n return results\n \ndef init():\n #add the config\n config_path = \"/root/DB-GPT/config/tool_config.yaml\"\n with open(config_path, 'r') as config_file:\n config = yaml.safe_load(config_file) \n db_args =DBArgs('pgsql', config)\n return db_args\n\n#create a table\ndef create_table(table_name,colsize, ncolumns):\n db=Database(init())\n column_definitions = ', '.join(f'name{i} varchar({colsize})' for i in range(ncolumns))\n creat_sql = f'CREATE TABLE {table_name} (id int, {column_definitions}, time timestamp);'\n db.execute_sqls(creat_sql)\n\n#delete the table\ndef delete_table(table_name):\n db=Database(init())\n delete_sql=f'DROP TABLE if exists {table_name}'\n db.execute_sqls(delete_sql)\n\n#print the current time\ndef print_time():\n current_time = datetime.datetime.now()\n formatted_time = current_time.strftime(\"%Y-%m-%d %H:%M:%S\")\n print(formatted_time)\n\n\ndef insert_large_data(threads,duration,ncolumns,nrows,colsize,table_name='table1'):\n\n print_time()\n #Delete undeleted tables\n delete_table(table_name)\n #create a new table\n create_table(table_name,colsize, ncolumns)\n db=Database(init())\n #insert the data\n #insert_definitions = ', '.join(f'repeat(round(random()*999)::text,{(colsize//3)})' for i in range(ncolumns))\n insert_definitions = ', '.join(f'(SELECT substr(md5(random()::text), 1, {colsize}))' for i in range(ncolumns))\n insert_data=f'insert into {table_name} select generate_series(1,{nrows}),{insert_definitions}, now();'\n db.concurrent_execute_sql(threads,duration,insert_data,commit_interval=1)\n\n #delete the table\n delete_table(table_name)\n \n #print the end time\n print_time()\nif __name__ == \"__main__\":\n # Number of threads to use for concurrent inserts\n num_threads = 162\n \n # Duration for which to run the inserts (in seconds)\n insert_duration = None\n \n # Number of columns in the table\n num_columns = 13\n \n # Number of rows to insert\n num_rows = 62\n \n # Size of each column (in characters)\n column_size = 79\n \n # Table name\n table_name = 'table1'\n \n # Call the insert_large_data function\n insert_large_data(num_threads, insert_duration, num_columns, num_rows, column_size, table_name)\n", + "description": "In a data streaming system, 162 data sources are simultaneously inserting a large amount of data into a database table with 13 columns. Each column can store up to 79 characters, and the table has a total of 62 rows. This process is causing a database exception due to the high volume of data being inserted at once.\n", + "desc": "In a file sharing system scenario, we can imagine a database named 'FileShareDB', which is used for sharing files among users. This database stores both the files themselves and metadata related to the files, such as file names, file sizes, upload dates, and download counts. On a typical day, multiple users might be simultaneously uploading or downloading files. For example, a team might be collaborating on a project and constantly uploading and downloading files to share updates. Additionally, the system might be used for storing and sharing various types of files, such as documents, images, or videos. When a large number of users are concurrently uploading or downloading files, the database might face performance issues. 
This can occur due to the lack of proper optimizations for handling concurrent file operations, such as insufficient buffering mechanisms, improper indexing, or inefficient data partitioning. As a result, the system might experience delays in file transfers or decreased responsiveness to user requests. Furthermore, the database server's resources, such as CPU and disk I/O, might be strained, leading to reduced performance and potential bottlenecks.\n" + }, + "273": { + "start_time": "1697336692", + "end_time": "1697336764", + "start_timestamp": "2023-10-15 10:24:52", + "end_timestamp": "2023-10-15 10:26:04", + "alerts": [], + "labels": [ + "highly concurrent commits or highly concurrent inserts" + ], + "command": "python anomaly_trigger/main.py --anomaly INSERT_LARGE_DATA", + "script": "import psycopg2\nimport sys\nsys.path.append('/root/DB-GPT/')\nimport time\nimport datetime\nimport random\nimport yaml\nfrom multiprocessing.pool import *\n\nclass DBArgs(object):\n\n def __init__(self, dbtype, config, dbname=None):\n self.dbtype = dbtype\n if self.dbtype == 'mysql':\n self.host = config['host']\n self.port = config['port']\n self.user = config['user']\n self.password = config['password']\n self.dbname = dbname if dbname else config['dbname']\n self.driver = 'com.mysql.jdbc.Driver'\n self.jdbc = 'jdbc:mysql://'\n else:\n self.host = config['host']\n self.port = config['port']\n self.user = config['user']\n self.password = config['password']\n self.dbname = dbname if dbname else config['dbname']\n self.driver = 'org.postgresql.Driver'\n self.jdbc = 'jdbc:postgresql://'\n\nclass Database():\n def __init__(self, args, timeout=-1):\n self.args = args\n self.conn = self.resetConn(timeout)\n\n\n # self.schema = self.compute_table_schema()\n\n def resetConn(self, timeout=-1):\n conn = psycopg2.connect(database=self.args.dbname,\n user=self.args.user,\n password=self.args.password,\n host=self.args.host,\n port=self.args.port)\n return conn\n\n def execute_sqls(self,sql):\n self.conn =self.resetConn(timeout=-1)\n cur = self.conn.cursor()\n cur.execute(sql)\n self.conn.commit()\n cur.close()\n self.conn.close()\n\n def execute_sql_duration(self, duration, sql, max_id=0, commit_interval=500):\n self.conn = self.resetConn(timeout=-1)\n cursor = self.conn.cursor()\n start = time.time()\n cnt = 0\n if duration > 0:\n while (time.time() - start) < duration:\n if max_id > 0:\n id = random.randint(1, max_id - 1)\n cursor.execute(sql + str(id) + ';')\n else:\n cursor.execute(sql)\n cnt += 1\n if cnt % commit_interval == 0:\n self.conn.commit()\n else:\n print(\"error, the duration should be larger than 0\")\n self.conn.commit()\n cursor.close()\n self.conn.close()\n return cnt\n\n def concurrent_execute_sql(self, threads, duration, sql, max_id=0, commit_interval=500):\n pool = ThreadPool(threads)\n results = [pool.apply_async(self.execute_sql_duration, (duration, sql, max_id, commit_interval)) for _ in range(threads)]\n pool.close()\n pool.join()\n return results\n \ndef init():\n #add the config\n config_path = \"/root/DB-GPT/config/tool_config.yaml\"\n with open(config_path, 'r') as config_file:\n config = yaml.safe_load(config_file) \n db_args =DBArgs('pgsql', config)\n return db_args\n\n#create a table\ndef create_table(table_name,colsize, ncolumns):\n db=Database(init())\n column_definitions = ', '.join(f'name{i} varchar({colsize})' for i in range(ncolumns))\n creat_sql = f'CREATE TABLE {table_name} (id int, {column_definitions}, time timestamp);'\n db.execute_sqls(creat_sql)\n\n#delete the table\ndef 
delete_table(table_name):\n db=Database(init())\n delete_sql=f'DROP TABLE if exists {table_name}'\n db.execute_sqls(delete_sql)\n\n#print the current time\ndef print_time():\n current_time = datetime.datetime.now()\n formatted_time = current_time.strftime(\"%Y-%m-%d %H:%M:%S\")\n print(formatted_time)\n\n\ndef insert_large_data(threads,duration,ncolumns,nrows,colsize,table_name='table1'):\n\n print_time()\n #Delete undeleted tables\n delete_table(table_name)\n #create a new table\n create_table(table_name,colsize, ncolumns)\n db=Database(init())\n #insert the data\n #insert_definitions = ', '.join(f'repeat(round(random()*999)::text,{(colsize//3)})' for i in range(ncolumns))\n insert_definitions = ', '.join(f'(SELECT substr(md5(random()::text), 1, {colsize}))' for i in range(ncolumns))\n insert_data=f'insert into {table_name} select generate_series(1,{nrows}),{insert_definitions}, now();'\n db.concurrent_execute_sql(threads,duration,insert_data,commit_interval=1)\n\n #delete the table\n delete_table(table_name)\n \n #print the end time\n print_time()\nif __name__ == \"__main__\":\n # Number of threads to use for concurrent inserts\n num_threads = 162\n \n # Duration for which to run the inserts (in seconds)\n insert_duration = None\n \n # Number of columns in the table\n num_columns = 34\n \n # Number of rows to insert\n num_rows = 67\n \n # Size of each column (in characters)\n column_size = 74\n \n # Table name\n table_name = 'table1'\n \n # Call the insert_large_data function\n insert_large_data(num_threads, insert_duration, num_columns, num_rows, column_size, table_name)\n", + "description": "In a data analytics platform, there is a scenario where 162 data sources simultaneously generate a large amount of data. Each data source has 34 columns with a column size of 74 characters, and there are a total of 67 rows. The data needs to be inserted into the database, but the high volume of data and simultaneous insertion can cause exceptions or performance issues. The script simulates this scenario to test the system's resilience.\n", + "desc": "In an Internet of Things (IoT) scenario, suppose there is a database called 'IoTDataDB', which is responsible for collecting and storing data from various sensors. This database is designed to handle a large volume of sensor data, and it contains a key table named 'SensorData' for storing this information. Each row in this table represents a data record from a sensor, and there are 67 rows of data in total. The table has 34 columns, each column capable of storing information up to 74 characters. The columns contain data such as sensor ID, sensor type, measured value, timestamp, location, sensor status, and more.When there is a spike in the number of connected sensors, such as 162 sensors transmitting data simultaneously, it can put a strain on the database's performance. The increased number of write requests, specifically the insertion of large amounts of data, might overwhelm the system's capacity to handle these requests efficiently. Without appropriate optimization measures, like data partitioning or indexing, the database might experience performance issues, such as longer write latencies and increased response times. 
These performance issues could result in anomalies in the database, impacting the overall operation of the IoT system.\n" + }, + "274": { + "start_time": "1697336824", + "end_time": "1697336884", + "start_timestamp": "2023-10-15 10:27:04", + "end_timestamp": "2023-10-15 10:28:04", + "alerts": [], + "labels": [ + "highly concurrent updates" + ], + "command": "python anomaly_trigger/main.py --anomaly LOCK_CONTENTION", + "script": "import psycopg2\nimport sys\nsys.path.append('/root/DB-GPT/')\nimport time\nimport datetime\nimport random\nimport yaml\nfrom multiprocessing.pool import *\n\n\nclass DBArgs(object):\n\n def __init__(self, dbtype, config, dbname=None):\n self.dbtype = dbtype\n if self.dbtype == 'mysql':\n self.host = config['host']\n self.port = config['port']\n self.user = config['user']\n self.password = config['password']\n self.dbname = dbname if dbname else config['dbname']\n self.driver = 'com.mysql.jdbc.Driver'\n self.jdbc = 'jdbc:mysql://'\n else:\n self.host = config['host']\n self.port = config['port']\n self.user = config['user']\n self.password = config['password']\n self.dbname = dbname if dbname else config['dbname']\n self.driver = 'org.postgresql.Driver'\n self.jdbc = 'jdbc:postgresql://'\n\nclass Database():\n def __init__(self, args, timeout=-1):\n self.args = args\n self.conn = self.resetConn(timeout)\n\n\n # self.schema = self.compute_table_schema()\n\n def resetConn(self, timeout=-1):\n conn = psycopg2.connect(database=self.args.dbname,\n user=self.args.user,\n password=self.args.password,\n host=self.args.host,\n port=self.args.port)\n return conn\n \n def execute_sqls(self,sql):\n self.conn =self.resetConn(timeout=-1)\n cur = self.conn.cursor()\n cur.execute(sql)\n self.conn.commit()\n cur.close()\n self.conn.close()\n\n def execute_sql_duration(self, duration, sql, max_id=0, commit_interval=500):\n self.conn = self.resetConn(timeout=-1)\n cursor = self.conn.cursor()\n start = time.time()\n cnt = 0\n if duration > 0:\n while (time.time() - start) < duration:\n if max_id > 0:\n id = random.randint(1, max_id - 1)\n cursor.execute(sql + str(id) + ';')\n else:\n cursor.execute(sql)\n cnt += 1\n if cnt % commit_interval == 0:\n self.conn.commit()\n else:\n print(\"error, the duration should be larger than 0\")\n self.conn.commit()\n cursor.close()\n self.conn.close()\n return cnt\n\n def concurrent_execute_sql(self, threads, duration, sql, max_id=0, commit_interval=500):\n pool = ThreadPool(threads)\n results = [pool.apply_async(self.execute_sql_duration, (duration, sql, max_id, commit_interval)) for _ in range(threads)]\n pool.close()\n pool.join()\n return results\n\ndef init():\n #add the config\n config_path = \"/root/DB-GPT/config/tool_config.yaml\"\n with open(config_path, 'r') as config_file:\n config = yaml.safe_load(config_file) \n db_args =DBArgs('pgsql', config)\n return db_args\n\n\n#create a table\ndef create_table(table_name,colsize, ncolumns):\n db=Database(init())\n column_definitions = ', '.join(f'name{i} varchar({colsize})' for i in range(ncolumns))\n creat_sql = f'CREATE TABLE {table_name} (id int, {column_definitions}, time timestamp);'\n db.execute_sqls(creat_sql)\n\n#delete the table\ndef delete_table(table_name):\n db=Database(init())\n delete_sql=f'DROP TABLE if exists {table_name}'\n db.execute_sqls(delete_sql)\n\n#print the current time\ndef print_time():\n current_time = datetime.datetime.now()\n formatted_time = current_time.strftime(\"%Y-%m-%d %H:%M:%S\")\n print(formatted_time)\n\n\ndef 
lock_contention(threads,duration,ncolumns,nrows,colsize,table_name='table1'):\n #create a new table\n print_time()\n delete_table(table_name)\n create_table(table_name,colsize, ncolumns)\n db=Database(init())\n # insert some data to be updated \n insert_definitions = ', '.join(f'(SELECT substr(md5(random()::text), 1, {colsize}))' for i in range(ncolumns))\n insert_data=f'insert into {table_name} select generate_series(1,{nrows}),{insert_definitions}, now();' \n db.execute_sqls(insert_data) \n pool = Pool(threads)\n for _ in range(threads):\n pool.apply_async(\n lock, (table_name, ncolumns, colsize, duration, nrows))\n pool.close()\n pool.join()\n #delete the table\n delete_table(table_name)\n print_time()\n\ndef lock(table_name, ncolumns, colsize, duration, nrows):\n args=init()\n start = time.time()\n #lock_contention\n while time.time()-start < duration:\n conn = psycopg2.connect(database=args.dbname, user=args.user, password=args.password,\n host=args.host, port=args.port)\n cur = conn.cursor()\n while time.time()-start < duration:\n col_name = random.randint(0, ncolumns-1)\n row_name = random.randint(1, nrows-1)\n lock_contention = f'update {table_name} set name{col_name}=(SELECT substr(md5(random()::text), 1, {colsize})) where id ={row_name}'\n #db.concurrent_execute_sql(threads,duration,lock_contention,nrows)\n cur.execute(lock_contention)\n conn.commit()\n conn.commit()\n conn.close()\n\nif __name__ == \"__main__\":\n # Number of threads to use for concurrent inserts\n num_threads = 186\n \n # Duration for which to run the inserts (in seconds)\n insert_duration = None\n \n # Number of columns in the table\n num_columns = 70\n \n # Number of rows to insert\n num_rows = 394\n \n # Size of each column (in characters)\n column_size = 65\n \n # Table name\n table_name = 'table1'\n \n # Call the insert_large_data function\n lock_contention(num_threads, insert_duration, num_columns, num_rows, column_size, table_name)\n", + "description": "In the database of an online store, there is a situation where 186 users simultaneously attempt to perform frequent update operations on a database table containing 70 columns and 394 rows of product records, with each column having a size of 65 characters. These users compete with each other to lock the database table and perform the update operations.\n", + "desc": "In an Internet of Things scenario, there is a database named 'SensorAnalyticsDB' that stores and analyzes sensor data from various devices. This database contains a table called 'SensorReadings', which stores data from different types of sensors. The 'SensorReadings' table has a total of 394 rows, each representing a reading from a sensor, with 70 columns containing information such as sensor ID, sensor type, reading value, timestamp, location, and status. In this scenario, 186 devices are simultaneously transmitting sensor data to the database. Due to the high number of concurrent write requests, the database might encounter lock contention issues. This means that when multiple devices try to write data to the same or adjacent rows in the 'SensorReadings' table, there could be a competition for locking the database, resulting in performance issues. 
This contention could lead to delayed processing or failure of write requests from other devices, affecting the efficiency of the data collection and analysis process.\n" + }, + "275": { + "start_time": "1697336944", + "end_time": "1697336985", + "start_timestamp": "2023-10-15 10:29:04", + "end_timestamp": "2023-10-15 10:29:45", + "alerts": [], + "labels": [ + "highly deletes" + ], + "command": "python anomaly_trigger/main.py --anomaly VACUUM", + "script": "import psycopg2\nimport sys\nsys.path.append('/root/DB-GPT/')\nimport time\nimport datetime\nimport random\nimport yaml\nfrom multiprocessing.pool import *\n\n\nclass DBArgs(object):\n\n def __init__(self, dbtype, config, dbname=None):\n self.dbtype = dbtype\n if self.dbtype == 'mysql':\n self.host = config['host']\n self.port = config['port']\n self.user = config['user']\n self.password = config['password']\n self.dbname = dbname if dbname else config['dbname']\n self.driver = 'com.mysql.jdbc.Driver'\n self.jdbc = 'jdbc:mysql://'\n else:\n self.host = config['host']\n self.port = config['port']\n self.user = config['user']\n self.password = config['password']\n self.dbname = dbname if dbname else config['dbname']\n self.driver = 'org.postgresql.Driver'\n self.jdbc = 'jdbc:postgresql://'\n\nclass Database():\n def __init__(self, args, timeout=-1):\n self.args = args\n self.conn = self.resetConn(timeout)\n\n\n # self.schema = self.compute_table_schema()\n\n def resetConn(self, timeout=-1):\n conn = psycopg2.connect(database=self.args.dbname,\n user=self.args.user,\n password=self.args.password,\n host=self.args.host,\n port=self.args.port)\n return conn\n \n def execute_sqls(self,sql):\n self.conn =self.resetConn(timeout=-1)\n cur = self.conn.cursor()\n cur.execute(sql)\n self.conn.commit()\n cur.close()\n self.conn.close()\n\n def execute_sql_duration(self, duration, sql, max_id=0, commit_interval=500):\n self.conn = self.resetConn(timeout=-1)\n cursor = self.conn.cursor()\n start = time.time()\n cnt = 0\n if duration > 0:\n while (time.time() - start) < duration:\n if max_id > 0:\n id = random.randint(1, max_id - 1)\n cursor.execute(sql + str(id) + ';')\n else:\n cursor.execute(sql)\n cnt += 1\n if cnt % commit_interval == 0:\n self.conn.commit()\n else:\n print(\"error, the duration should be larger than 0\")\n self.conn.commit()\n cursor.close()\n self.conn.close()\n return cnt\n\n def concurrent_execute_sql(self, threads, duration, sql, max_id=0, commit_interval=500):\n pool = ThreadPool(threads)\n results = [pool.apply_async(self.execute_sql_duration, (duration, sql, max_id, commit_interval)) for _ in range(threads)]\n pool.close()\n pool.join()\n return results\n \ndef init():\n #add the config\n config_path = \"/root/DB-GPT/config/tool_config.yaml\"\n with open(config_path, 'r') as config_file:\n config = yaml.safe_load(config_file) \n db_args =DBArgs('pgsql', config)\n return db_args\n\n\n#create a table\ndef create_table(table_name,colsize, ncolumns):\n db=Database(init())\n column_definitions = ', '.join(f'name{i} varchar({colsize})' for i in range(ncolumns))\n creat_sql = f'CREATE TABLE {table_name} (id int, {column_definitions}, time timestamp);'\n db.execute_sqls(creat_sql)\n\n#delete the table\ndef delete_table(table_name):\n db=Database(init())\n delete_sql=f'DROP TABLE if exists {table_name}'\n db.execute_sqls(delete_sql)\n\n#print the current time\ndef print_time():\n current_time = datetime.datetime.now()\n formatted_time = current_time.strftime(\"%Y-%m-%d %H:%M:%S\")\n print(formatted_time)\n\n\ndef 
vacuum(threads,duration,ncolumns,nrows,colsize,table_name='table1'):\n db=Database(init())\n #create a new table\n print_time()\n delete_table(table_name)\n create_table(table_name,colsize, ncolumns)\n\n # insert some data to be deleted\n insert_definitions = ', '.join(f'(SELECT substr(md5(random()::text), 1, {colsize}))' for i in range(ncolumns))\n insert_data=f'insert into {table_name} select generate_series(1,{nrows}),{insert_definitions}, now();' \n db.execute_sqls(insert_data) \n\n # delete 80% of the rows\n delete_nrows=int(nrows*0.8)\n vacuum=f'delete from {table_name} where id < {delete_nrows};'\n db.execute_sqls(vacuum)\n\n # do the select , then the vacuum occurs\n select='select * from '+table_name+' where id='\n db.concurrent_execute_sql(threads,duration,select,nrows)\n\n #delete the table\n delete_table(table_name)\n print_time()\n\n\nif __name__ == \"__main__\":\n # Number of threads to use for concurrent inserts\n num_threads = 110\n \n # Duration for which to run the inserts (in seconds)\n insert_duration = None\n \n # Number of columns in the table\n num_columns = 13\n \n # Number of rows to insert\n num_rows = 3469806\n \n # Size of each column (in characters)\n column_size = 74\n \n # Table name\n table_name = 'table1'\n \n # Call the insert_large_data function\n vacuum(num_threads, insert_duration, num_columns, num_rows, column_size, table_name)\n", + "description": "In the inventory management system of a large retail store, 110 employees simultaneously perform a product search after a data cleaning operation on a database table containing 13 columns, 3,469,806 rows, with each column having a size of 74 characters. This may lead to a database exception due to the increased workload on the database server.\n", + "desc": "In an e-commerce platform database named 'OnlineStoreDB', which is used for an online store, there is a table called 'ProductRecords' that stores detailed information about various products. This table contains 3,469,806 rows of data, each representing a specific product. The table consists of 13 columns, each column containing information of up to 74 characters. These columns may include product ID, name, price, stock quantity, description, brand, category, size, color, weight, production date, expiration date, and supplier information. At a particular moment, while the platform is experiencing high user traffic, a database vacuum operation is executed. This operation involves freeing up space in the database by reclaiming unused disk space and optimizing the storage structure. However, due to the large number of rows and columns in the 'ProductRecords' table, and the high concurrency of user requests (110 threads), the vacuum operation might take a considerable amount of time to complete. 
This can potentially lead to an inefficient use of system resources and impact the overall performance of the database.\n" + }, + "276": { + "start_time": "1697337045", + "end_time": "1697337161", + "start_timestamp": "2023-10-15 10:30:45", + "end_timestamp": "2023-10-15 10:32:41", + "alerts": [], + "labels": [ + "too many indexes" + ], + "command": "python anomaly_trigger/main.py --anomaly REDUNDANT_INDEX", + "script": "import psycopg2\nimport sys\nsys.path.append('/root/DB-GPT/')\nimport time\nimport datetime\nimport random\nimport yaml\nfrom multiprocessing.pool import *\n\n\nclass DBArgs(object):\n\n def __init__(self, dbtype, config, dbname=None):\n self.dbtype = dbtype\n if self.dbtype == 'mysql':\n self.host = config['host']\n self.port = config['port']\n self.user = config['user']\n self.password = config['password']\n self.dbname = dbname if dbname else config['dbname']\n self.driver = 'com.mysql.jdbc.Driver'\n self.jdbc = 'jdbc:mysql://'\n else:\n self.host = config['host']\n self.port = config['port']\n self.user = config['user']\n self.password = config['password']\n self.dbname = dbname if dbname else config['dbname']\n self.driver = 'org.postgresql.Driver'\n self.jdbc = 'jdbc:postgresql://'\n\nclass Database():\n def __init__(self, args, timeout=-1):\n self.args = args\n self.conn = self.resetConn(timeout)\n\n\n # self.schema = self.compute_table_schema()\n\n def resetConn(self, timeout=-1):\n conn = psycopg2.connect(database=self.args.dbname,\n user=self.args.user,\n password=self.args.password,\n host=self.args.host,\n port=self.args.port)\n return conn\n \n def execute_sqls(self,sql):\n self.conn =self.resetConn(timeout=-1)\n cur = self.conn.cursor()\n cur.execute(sql)\n self.conn.commit()\n cur.close()\n self.conn.close()\n\n def execute_sql_duration(self, duration, sql, max_id=0, commit_interval=500):\n self.conn = self.resetConn(timeout=-1)\n cursor = self.conn.cursor()\n start = time.time()\n cnt = 0\n if duration > 0:\n while (time.time() - start) < duration:\n if max_id > 0:\n id = random.randint(1, max_id - 1)\n cursor.execute(sql + str(id) + ';')\n else:\n cursor.execute(sql)\n cnt += 1\n if cnt % commit_interval == 0:\n self.conn.commit()\n else:\n print(\"error, the duration should be larger than 0\")\n self.conn.commit()\n cursor.close()\n self.conn.close()\n return cnt\n\n def concurrent_execute_sql(self, threads, duration, sql, max_id=0, commit_interval=500):\n pool = ThreadPool(threads)\n results = [pool.apply_async(self.execute_sql_duration, (duration, sql, max_id, commit_interval)) for _ in range(threads)]\n pool.close()\n pool.join()\n return results\n \n def build_index(self, table_name, idx_num):\n self.conn = self.resetConn(timeout=-1)\n cursor = self.conn.cursor()\n \n for i in range(0, idx_num):\n the_sql = 'CREATE INDEX index_' + table_name + '_' + str(i) + ' ON ' + table_name + '(name' + str(i) + ');'\n print(the_sql)\n cursor.execute(the_sql)\n\n \n self.conn.commit()\n self.conn.close()\n return\n\n\n \n def drop_index(self,table_name):\n self.conn = self.resetConn(timeout=-1)\n cursor = self.conn.cursor()\n cursor.execute(\"select indexname from pg_indexes where tablename='\"+table_name+\"';\")\n idxs = cursor.fetchall()\n for idx in idxs:\n the_sql = 'DROP INDEX ' + idx[0] + ';'\n cursor.execute(the_sql)\n print(the_sql)\n self.conn.commit()\n self.conn.close()\n return\n\n\ndef init():\n #add the config\n config_path = \"/root/DB-GPT/config/tool_config.yaml\"\n with open(config_path, 'r') as config_file:\n config = yaml.safe_load(config_file) 
\n db_args =DBArgs('pgsql', config)\n return db_args\n\n\n#create a table\ndef create_table(table_name,colsize, ncolumns):\n db=Database(init())\n column_definitions = ', '.join(f'name{i} varchar({colsize})' for i in range(ncolumns))\n creat_sql = f'CREATE TABLE {table_name} (id int, {column_definitions}, time timestamp);'\n db.execute_sqls(creat_sql)\n\n#delete the table\ndef delete_table(table_name):\n db=Database(init())\n delete_sql=f'DROP TABLE if exists {table_name}'\n db.execute_sqls(delete_sql)\n\n#print the current time\ndef print_time():\n current_time = datetime.datetime.now()\n formatted_time = current_time.strftime(\"%Y-%m-%d %H:%M:%S\")\n print(formatted_time)\n\ndef redundent_index(threads,duration,ncolumns,nrows,colsize,nindex,table_name='table1'):\n #create a new table\n print_time()\n delete_table(table_name)\n create_table(table_name,colsize, ncolumns)\n db=Database(init())\n # insert some data to be updated \n insert_definitions = ', '.join(f'(SELECT substr(md5(random()::text), 1, {colsize}))' for i in range(ncolumns))\n insert_data=f'insert into {table_name} select generate_series(1,{nrows}),{insert_definitions}, now();' \n db.execute_sqls(insert_data) \n\n #initialization of the indexes\n nindex=int((nindex*ncolumns)/10)\n db.build_index(table_name,nindex)\n id_index='CREATE INDEX index_'+table_name+'_id ON '+table_name+'(id);'\n db.execute_sqls(id_index)\n\n #lock_contention\n pool = Pool(threads)\n for _ in range(threads):\n pool.apply_async(\n lock, (table_name, ncolumns, colsize, duration, nrows))\n pool.close()\n pool.join()\n\n #drop the index\n db.drop_index(table_name)\n\n #delete the table\n delete_table(table_name)\n print_time()\n\n\ndef lock(table_name, ncolumns, colsize, duration, nrows):\n args=init()\n start = time.time()\n #lock_contention\n while time.time()-start < duration:\n conn = psycopg2.connect(database=args.dbname, user=args.user, password=args.password,\n host=args.host, port=args.port)\n cur = conn.cursor()\n while time.time()-start < duration:\n col_name = random.randint(0, ncolumns-1)\n row_name = random.randint(1, nrows-1)\n lock_contention = f'update {table_name} set name{col_name}=(SELECT substr(md5(random()::text), 1, {colsize})) where id ={row_name}'\n #db.concurrent_execute_sql(threads,duration,lock_contention,nrows)\n cur.execute(lock_contention)\n conn.commit()\n conn.commit()\n conn.close()\n\nif __name__ == \"__main__\":\n # Number of threads to use for concurrent inserts\n num_threads = 6\n \n # Duration for which to run the inserts (in seconds)\n insert_duration = None\n \n # Number of columns in the table\n num_columns = 95\n \n # Number of rows to insert\n num_rows = 751014\n \n # Size of each column (in characters)\n column_size = 91\n \n # Table name\n table_name = 'table1'\n \n nindex=6\n \n # Call the insert_large_data function\n redundent_index(num_threads, insert_duration, num_columns, num_rows, column_size, nindex,table_name)\n", + "description": "In a financial banking database with 95 columns and 751,014 rows, each with a column size of 91 characters, multiple users try to create redundant indexes for different attributes like customer name, account number, transaction amount, etc., causing additional storage consumption and performance overhead.\n", + "desc": "In a business intelligence scenario, there is a database named 'BusinessDataDB' used for storing and analyzing various business data. 
The database contains multiple tables, one of which is a key table named 'BusinessRecords', recording detailed information about various business activities. This table consists of 751,014 rows of data, each row representing a business record, with a total of 95 columns, each containing information of up to 91 characters. These columns may include transaction ID, transaction type, amount, date, department, project code, budget code, financial year, audit status, and more. In this scenario, there is a need to perform index acceleration for complex queries such as management reports, financial analysis, or trend analysis. As a result, the database administrator might create redundant indexes on the 'BusinessRecords' table to improve query performance. However, if the indexes are not properly managed or utilized, they can lead to duplicate or excessive indexes, causing additional storage usage and performance overhead in the database. This can result in inefficient query execution, delayed report generation, and reduced efficiency in decision-making processes.\n" + }, + "277": { + "start_time": "1697337221", + "end_time": "1697337312", + "start_timestamp": "2023-10-15 10:33:41", + "end_timestamp": "2023-10-15 10:35:12", + "alerts": [], + "labels": [ + "INSERT_LARGE_DATA", + "IO_CONTENTION" + ], + "command": "python anomaly_trigger/main.py --anomaly INSERT_LARGE_DATA,IO_CONTENTION", + "script": "import os\nimport datetime\n\n#print the current time\ndef print_time():\n current_time = datetime.datetime.now()\n formatted_time = current_time.strftime(\"%Y-%m-%d %H:%M:%S\")\n print(formatted_time)\n\nif __name__ == \"__main__\":\n print_time()\n command = (\n \"su - root -c 'cd /sysbench-tpcc-master; \"\n \"./tpcc.lua --db-driver=pgsql --tables=2 --scale=3 --threads=50 --events=0 \"\n \"--pgsql-host=xxxx --pgsql-user=xxxx --pgsql-password=xxxx \"\n \"--pgsql-port=5432 --pgsql-db=tpcc --time=90 --rand-type=uniform --report-interval=10 run'\"\n )\n\n os.system(command)\n print_time()\n", + "description": "In a file sharing system, multiple users are simultaneously uploading, downloading, or editing files. This leads to competition for input/output resources, causing slower file transfers.\n", + "desc": "In a file sharing system scenario, let's consider a database called 'TeamFileShareDB', which is a platform used by teams or organizations to share files. This database not only stores the actual files but also records their metadata, including information such as the uploader's details, file size, creation and modification dates, version history, access permissions, and download counts. On a typical day, multiple users might be simultaneously uploading, downloading, or editing files. For instance, a project team may work together to complete an important report, with team members frequently uploading the latest versions of files that others then download for viewing or editing. Additionally, the system might accommodate large file storage, such as presentations, video conference recordings, or design drawings. Due to the high concurrency involved in file operations, the 'TeamFileShareDB' database faces challenges related to I/O (input/output) contention. When multiple users simultaneously upload or download large files, the system's storage and network bandwidth might become significantly strained. This I/O contention leads to slower file transfer speeds, especially in situations with limited bandwidth or insufficient server processing capabilities. 
Furthermore, the frequent write operations in the database, such as file uploads and metadata updates, can impact overall database performance. During peak usage times, the database might encounter locking and transaction management issues, further slowing down the file processing and metadata recording processes.\n" + }, + "278": { + "start_time": "1697337372", + "end_time": "1697337432", + "start_timestamp": "2023-10-15 10:36:12", + "end_timestamp": "2023-10-15 10:37:12", + "alerts": [], + "labels": [ + "POOR JOIN PERFORMANCE", + "CPU CONTENTION" + ], + "command": "python anomaly_trigger/main.py --anomaly POOR_JOIN_PERFORMANCE,CPU_CONTENTION", + "script": "import os\nimport re\nimport time\n\nimport psycopg2\n\n\nREPEATCOUNT = 1\nTIMELOGPATH = str(int(time.time())) + \"_job_trigger_time_log.txt\"\nTIMELOG = open(TIMELOGPATH, 'w+')\n\n\nclass Database():\n\n def __init__(self):\n self.conn = None\n self.conn = psycopg2.connect(database='imdbload',\n user='xxxx',\n password='xxxx',\n host='xxxx',\n port=5432)\n\n def execute_sql(self, sql):\n fail = 1\n cur = self.conn.cursor()\n i = 0\n cnt = 3\n while fail == 1 and i < cnt:\n try:\n fail = 0\n cur.execute(sql)\n except BaseException as error:\n fail = 1\n print(error)\n res = []\n if fail == 0:\n res = cur.fetchall()\n i = i + 1\n if fail == 1:\n # print(\"SQL Execution Fatal!!\", sql)\n return 0, ''\n elif fail == 0:\n return 1, res\n\n\ndef all_sql_files():\n res_path = \"{}/join-order-benchmark-master/\".format(\n os.path.dirname(os.path.abspath(__file__)))\n # all_file_list = list(filter(file_filter, os.listdir(res_path)))\n # all_file_list = sorted(all_file_list, key=custom_sort)\n all_file_list = [\n '1a.sql', '1b.sql', '1c.sql', '1d.sql',\n '2a.sql', '2b.sql', '2c.sql', '2d.sql',\n '3a.sql', '3b.sql', '3c.sql',\n '4a.sql', '4b.sql', '4c.sql',\n '5a.sql', '5b.sql', '5c.sql',\n '6a.sql', '6b.sql', '6c.sql', '6d.sql', '6e.sql', '6f.sql',\n '7a.sql', '7b.sql', '7c.sql',\n '8a.sql', '8b.sql', '8c.sql', '8d.sql',\n '9a.sql', '9b.sql', '9c.sql', '9d.sql',\n '10a.sql', '10b.sql', '10c.sql',\n '11a.sql', '11b.sql', '11c.sql', '11d.sql',\n '12a.sql', '12b.sql', '12c.sql',\n '13a.sql', '13b.sql', '13c.sql', '13d.sql',\n '14a.sql', '14b.sql', '14c.sql',\n '15a.sql', '15b.sql', '15c.sql', '15d.sql',\n '16a.sql', '16b.sql', '16c.sql', '16d.sql',\n '17a.sql', '17b.sql', '17c.sql', '17d.sql', '17e.sql', '17f.sql',\n '18a.sql', '18b.sql', '18c.sql',\n '19a.sql', '19b.sql', '19c.sql', '19d.sql',\n '20a.sql', '20b.sql', '20c.sql',\n '21a.sql', '21b.sql', '21c.sql',\n '22a.sql', '22b.sql', '22c.sql', '22d.sql',\n '23a.sql', '23b.sql', '23c.sql',\n '24a.sql', '24b.sql',\n '25a.sql', '25b.sql', '25c.sql',\n '26a.sql', '26b.sql', '26c.sql',\n '27a.sql', '27b.sql', '27c.sql',\n '28a.sql', '28b.sql', '28c.sql',\n '29a.sql', '29b.sql', '29c.sql',\n '30a.sql', '30b.sql', '30c.sql',\n '31a.sql', '31b.sql', '31c.sql',\n '32a.sql', '32b.sql',\n '33a.sql', '33b.sql', '33c.sql']\n\n print(all_file_list)\n files_list = []\n for file in all_file_list:\n files_list.append(res_path + file)\n return files_list\n\n\ndef custom_sort(item):\n # \u63d0\u53d6\u6570\u5b57\u548c\u5b57\u6bcd\u90e8\u5206\n match = re.match(r'(\\d+)(\\D+)', item)\n # \u5c06\u6570\u5b57\u90e8\u5206\u8f6c\u6362\u4e3a\u6574\u6570\u4ee5\u8fdb\u884c\u6bd4\u8f83\n num_part = int(match.group(1))\n # \u8fd4\u56de\u5143\u7ec4\u4ee5\u6309\u6570\u5b57\u548c\u5b57\u6bcd\u6392\u5e8f\n return (num_part, match.group(2))\n\n\ndef file_filter(f):\n if f[-4:] == '.sql' and 'schema' not in f and 'fkindexes' not 
in f:\n return True\n else:\n return False\n\n\ndef get_sql_from_file(file_name):\n file = open(file_name)\n lines = file.readlines().copy()\n sql = ''\n for line in lines:\n sql += line\n sql = sql.replace('\n', ' ').replace(' ', ' ').replace(' ', ' ')\n file.close()\n return sql\n\n\ndef test_hint_from_file(sql_file):\n db = Database()\n sql = get_sql_from_file(sql_file)\n success, result_cont = db.execute_sql(sql)\n print(success, result_cont)\n\n\ndef test_all():\n sql_files = all_sql_files()\n\n for sql_file in list(sql_files)[:-10]:\n if sql_file:\n test_hint_from_file(sql_file)\n\n\ndef test_one():\n res_path = \"{}/join-order-benchmark-master/\".format(\n os.path.dirname(os.path.abspath(__file__)))\n test_hint_from_file(res_path + '1a.sql')\n\n\nif __name__ == '__main__':\n for i in range(0, REPEATCOUNT):\n TIMELOG.write(str(int(time.time()))+\";\")\n test_all()\n TIMELOG.write(str(int(time.time()))+\"\n\")\n TIMELOG.flush()\n\n TIMELOG.close()\n\n)\n", + "description": "In a business intelligence system, multiple users perform join operations on a large table using complex conditions. These join operations place a heavy load on the CPU, causing contention and reducing the performance of the system.\n", + "desc": "In a business intelligence scenario, imagine a database named 'CorporateAnalyticsDB', used for storing and analyzing various business data of a large corporation. This database contains multiple complex tables such as 'SalesData', 'CustomerProfiles', 'ProductCatalog', and 'MarketTrends', each filled with a large number of rows and columns, recording detailed business information.In this scenario, the company's analysts frequently need to perform complex join queries across multiple tables to generate comprehensive business reports. For example, they might need to join the 'SalesData' table (containing sales records) with the 'CustomerProfiles' table (containing customer information) to analyze the purchasing behaviors of different customer groups. At the same time, they might also need to link these data with 'ProductCatalog' (the product catalog) and 'MarketTrends' (market trends) to gain deeper market insights.Given the large size of these tables and the involvement of multi-table joins, the performance of these queries can become very slow. If the join keys in these tables are not effectively indexed, or if the queries are not properly optimized, then executing these join operations can consume a significant amount of time and resources. During peak periods, when multiple complex join queries are executed simultaneously, this can lead to competition for CPU resources, further reducing query efficiency. 
The CPU contention might occur due to too many compute-intensive queries running on the database server, or the server's CPU resources being insufficient to handle these queries.\n" + }, + "279": { + "start_time": "1697337492", + "end_time": "1697337642", + "start_timestamp": "2023-10-15 10:38:12", + "end_timestamp": "2023-10-15 10:40:42", + "alerts": [], + "labels": [ + "FETCH_LARGE_DATA", + "CORRELATED SUBQUERY" + ], + "command": "python anomaly_trigger/main.py --anomaly FETCH_LARGE_DATA,CORRELATED_SUBQUERY", + "script": "import os\nimport re\nimport time\n\nimport psycopg2\n\n\nREPEATCOUNT = 1\nTIMELOGPATH = str(int(time.time())) + \"_tpch_trigger_time_log.txt\"\nTIMELOG = open(TIMELOGPATH, 'w+')\n\n\nclass Database():\n\n def __init__(self):\n self.conn = None\n self.conn = psycopg2.connect(database='tpch',\n user='xxxx',\n password='xxxx',\n host='xxxx',\n port=5432)\n\n def execute_sql(self, sql):\n fail = 1\n cur = self.conn.cursor()\n i = 0\n cnt = 3\n while fail == 1 and i < cnt:\n try:\n fail = 0\n cur.execute(sql)\n except BaseException as error:\n fail = 1\n print(error)\n res = []\n if fail == 0:\n res = cur.fetchall()\n i = i + 1\n if fail == 1:\n # print(\"SQL Execution Fatal!!\", sql)\n return 0, ''\n elif fail == 0:\n return 1, res\n\n\ndef all_sql_files():\n res_path = \"{}/tpch-queries/\".format(\n os.path.dirname(os.path.abspath(__file__)))\n # all_file_list = list(filter(file_filter, os.listdir(res_path)))\n # all_file_list = sorted(all_file_list, key=custom_sort)\n all_file_list = [\n '4.explain.sql']\n\n print(all_file_list)\n files_list = []\n for file in all_file_list:\n files_list.append(res_path + file)\n return files_list\n\n\ndef custom_sort(item):\n # \u63d0\u53d6\u6570\u5b57\u548c\u5b57\u6bcd\u90e8\u5206\n match = re.match(r'(\\d+)(\\D+)', item)\n # \u5c06\u6570\u5b57\u90e8\u5206\u8f6c\u6362\u4e3a\u6574\u6570\u4ee5\u8fdb\u884c\u6bd4\u8f83\n num_part = int(match.group(1))\n # \u8fd4\u56de\u5143\u7ec4\u4ee5\u6309\u6570\u5b57\u548c\u5b57\u6bcd\u6392\u5e8f\n return (num_part, match.group(2))\n\n\ndef file_filter(f):\n if f[-4:] == '.sql' and 'schema' not in f and 'fkindexes' not in f:\n return True\n else:\n return False\n\n\ndef get_sql_from_file(file_name):\n file = open(file_name)\n lines = file.readlines().copy()\n sql = ''\n for line in lines:\n sql += line\n sql = sql.replace('\n', ' ').replace(' ', ' ').replace(' ', ' ')\n file.close()\n return sql\n\n\ndef test_hint_from_file(sql_file):\n db = Database()\n sql = get_sql_from_file(sql_file)\n success, result_cont = db.execute_sql(sql)\n print(success, result_cont)\n\n\ndef test_all():\n sql_files = all_sql_files()\n\n for sql_file in list(sql_files):\n if sql_file:\n test_hint_from_file(sql_file)\n\n\ndef test_one():\n res_path = \"{}/tpch-queries/\".format(\n os.path.dirname(os.path.abspath(__file__)))\n test_hint_from_file(res_path + '1.explain.sql')\n\n\nif __name__ == '__main__':\n for i in range(0, REPEATCOUNT):\n TIMELOG.write(str(int(time.time()))+\";\")\n test_all()\n TIMELOG.write(str(int(time.time()))+\"\n\")\n TIMELOG.flush()\n\n TIMELOG.close()\n", + "description": "In a real-life scenario, imagine an e-commerce platform where a manager wants to analyze the inventory for each product. However, the database is not optimized and the query to fetch this large amount of data requires executing correlated subqueries. 
This leads to a slowdown in performance, making it difficult for the manager to efficiently analyze the inventory.\n", + "desc": "In an e-commerce platform database, called 'ECommerceDB', there exists a table named 'ProductInventory' that stores information about the inventory levels of various products. This table contains data for tens of thousands or even hundreds of thousands of products. Each product's inventory information includes details such as the product ID, current stock level, last inventory update time, supplier ID, and warehouse location. When querying the inventory level of products, it is common to perform related subqueries. For example, one might want to determine the total current inventory of all products within a specific category. To do this, the database would need to select all products of that category from the 'ProductDetails' table and then conduct subqueries on the 'ProductInventory' table to obtain the inventory data for those products. However, when the number of products is large, these related subqueries can become inefficient. In particular, if a category includes thousands of products, it would be time-consuming to execute individual subqueries for each product. In such cases, the database may need to read a significant amount of data from the disk, leading to I/O bottlenecks.\n" + }, + "280": { + "start_time": "1697337702", + "end_time": "1697337774", + "start_timestamp": "2023-10-15 10:41:42", + "end_timestamp": "2023-10-15 10:42:54", + "alerts": [], + "labels": [ + "highly concurrent commits or highly concurrent inserts" + ], + "command": "python anomaly_trigger/main.py --anomaly INSERT_LARGE_DATA", + "script": "import psycopg2\nimport sys\nsys.path.append('/root/DB-GPT/')\nimport time\nimport datetime\nimport random\nimport yaml\nfrom multiprocessing.pool import *\n\nclass DBArgs(object):\n\n def __init__(self, dbtype, config, dbname=None):\n self.dbtype = dbtype\n if self.dbtype == 'mysql':\n self.host = config['host']\n self.port = config['port']\n self.user = config['user']\n self.password = config['password']\n self.dbname = dbname if dbname else config['dbname']\n self.driver = 'com.mysql.jdbc.Driver'\n self.jdbc = 'jdbc:mysql://'\n else:\n self.host = config['host']\n self.port = config['port']\n self.user = config['user']\n self.password = config['password']\n self.dbname = dbname if dbname else config['dbname']\n self.driver = 'org.postgresql.Driver'\n self.jdbc = 'jdbc:postgresql://'\n\nclass Database():\n def __init__(self, args, timeout=-1):\n self.args = args\n self.conn = self.resetConn(timeout)\n\n\n # self.schema = self.compute_table_schema()\n\n def resetConn(self, timeout=-1):\n conn = psycopg2.connect(database=self.args.dbname,\n user=self.args.user,\n password=self.args.password,\n host=self.args.host,\n port=self.args.port)\n return conn\n\n def execute_sqls(self,sql):\n self.conn =self.resetConn(timeout=-1)\n cur = self.conn.cursor()\n cur.execute(sql)\n self.conn.commit()\n cur.close()\n self.conn.close()\n\n def execute_sql_duration(self, duration, sql, max_id=0, commit_interval=500):\n self.conn = self.resetConn(timeout=-1)\n cursor = self.conn.cursor()\n start = time.time()\n cnt = 0\n if duration > 0:\n while (time.time() - start) < duration:\n if max_id > 0:\n id = random.randint(1, max_id - 1)\n cursor.execute(sql + str(id) + ';')\n else:\n cursor.execute(sql)\n cnt += 1\n if cnt % commit_interval == 0:\n self.conn.commit()\n else:\n print(\"error, the duration should be larger than 0\")\n self.conn.commit()\n 
cursor.close()\n self.conn.close()\n return cnt\n\n def concurrent_execute_sql(self, threads, duration, sql, max_id=0, commit_interval=500):\n pool = ThreadPool(threads)\n results = [pool.apply_async(self.execute_sql_duration, (duration, sql, max_id, commit_interval)) for _ in range(threads)]\n pool.close()\n pool.join()\n return results\n \ndef init():\n #add the config\n config_path = \"/root/DB-GPT/config/tool_config.yaml\"\n with open(config_path, 'r') as config_file:\n config = yaml.safe_load(config_file) \n db_args =DBArgs('pgsql', config)\n return db_args\n\n#create a table\ndef create_table(table_name,colsize, ncolumns):\n db=Database(init())\n column_definitions = ', '.join(f'name{i} varchar({colsize})' for i in range(ncolumns))\n creat_sql = f'CREATE TABLE {table_name} (id int, {column_definitions}, time timestamp);'\n db.execute_sqls(creat_sql)\n\n#delete the table\ndef delete_table(table_name):\n db=Database(init())\n delete_sql=f'DROP TABLE if exists {table_name}'\n db.execute_sqls(delete_sql)\n\n#print the current time\ndef print_time():\n current_time = datetime.datetime.now()\n formatted_time = current_time.strftime(\"%Y-%m-%d %H:%M:%S\")\n print(formatted_time)\n\n\ndef insert_large_data(threads,duration,ncolumns,nrows,colsize,table_name='table1'):\n\n print_time()\n #Delete undeleted tables\n delete_table(table_name)\n #create a new table\n create_table(table_name,colsize, ncolumns)\n db=Database(init())\n #insert the data\n #insert_definitions = ', '.join(f'repeat(round(random()*999)::text,{(colsize//3)})' for i in range(ncolumns))\n insert_definitions = ', '.join(f'(SELECT substr(md5(random()::text), 1, {colsize}))' for i in range(ncolumns))\n insert_data=f'insert into {table_name} select generate_series(1,{nrows}),{insert_definitions}, now();'\n db.concurrent_execute_sql(threads,duration,insert_data,commit_interval=1)\n\n #delete the table\n delete_table(table_name)\n \n #print the end time\n print_time()\nif __name__ == \"__main__\":\n # Number of threads to use for concurrent inserts\n num_threads = 149\n \n # Duration for which to run the inserts (in seconds)\n insert_duration = None\n \n # Number of columns in the table\n num_columns = 7\n \n # Number of rows to insert\n num_rows = 78\n \n # Size of each column (in characters)\n column_size = 63\n \n # Table name\n table_name = 'table1'\n \n # Call the insert_large_data function\n insert_large_data(num_threads, insert_duration, num_columns, num_rows, column_size, table_name)\n", + "description": "In a manufacturing facility, there are 149 machines producing data that needs to be inserted into a database table. Each machine has 7 data points with a column size of 63 characters. The database is set up to handle 78 rows of data. Running this script will simulate the exception that occurs when trying to insert a large amount of data from multiple machines into the database.\n", + "desc": "In a database used for an e-commerce platform named 'ECommerceDB', a table named 'ProductData' holds information about various products available for sale. This table contains 78 rows of data, each representing a specific product entry. Each row consists of 7 columns, each column storing information up to 63 characters long. These columns may include attributes such as product ID, name, description, price, brand, category, and availability status.The scenario described above involves a situation where multiple users are simultaneously inserting a large amount of data into the 'ProductData' table. 
This could be due to a variety of reasons, such as the addition of new products to the platform or updates to existing product information.If the database is not properly optimized or lacks efficient data insertion mechanisms, this influx of large-scale data could lead to performance issues. Specifically, the insertion process might become slower and less responsive. In some cases, it could even result in database lock ups or failures, negatively impacting the functionality and reliability of the e-commerce platform.\n" + }, + "281": { + "start_time": "1697337834", + "end_time": "1697337906", + "start_timestamp": "2023-10-15 10:43:54", + "end_timestamp": "2023-10-15 10:45:06", + "alerts": [], + "labels": [ + "highly concurrent commits or highly concurrent inserts" + ], + "command": "python anomaly_trigger/main.py --anomaly INSERT_LARGE_DATA", + "script": "import psycopg2\nimport sys\nsys.path.append('/root/DB-GPT/')\nimport time\nimport datetime\nimport random\nimport yaml\nfrom multiprocessing.pool import *\n\nclass DBArgs(object):\n\n def __init__(self, dbtype, config, dbname=None):\n self.dbtype = dbtype\n if self.dbtype == 'mysql':\n self.host = config['host']\n self.port = config['port']\n self.user = config['user']\n self.password = config['password']\n self.dbname = dbname if dbname else config['dbname']\n self.driver = 'com.mysql.jdbc.Driver'\n self.jdbc = 'jdbc:mysql://'\n else:\n self.host = config['host']\n self.port = config['port']\n self.user = config['user']\n self.password = config['password']\n self.dbname = dbname if dbname else config['dbname']\n self.driver = 'org.postgresql.Driver'\n self.jdbc = 'jdbc:postgresql://'\n\nclass Database():\n def __init__(self, args, timeout=-1):\n self.args = args\n self.conn = self.resetConn(timeout)\n\n\n # self.schema = self.compute_table_schema()\n\n def resetConn(self, timeout=-1):\n conn = psycopg2.connect(database=self.args.dbname,\n user=self.args.user,\n password=self.args.password,\n host=self.args.host,\n port=self.args.port)\n return conn\n\n def execute_sqls(self,sql):\n self.conn =self.resetConn(timeout=-1)\n cur = self.conn.cursor()\n cur.execute(sql)\n self.conn.commit()\n cur.close()\n self.conn.close()\n\n def execute_sql_duration(self, duration, sql, max_id=0, commit_interval=500):\n self.conn = self.resetConn(timeout=-1)\n cursor = self.conn.cursor()\n start = time.time()\n cnt = 0\n if duration > 0:\n while (time.time() - start) < duration:\n if max_id > 0:\n id = random.randint(1, max_id - 1)\n cursor.execute(sql + str(id) + ';')\n else:\n cursor.execute(sql)\n cnt += 1\n if cnt % commit_interval == 0:\n self.conn.commit()\n else:\n print(\"error, the duration should be larger than 0\")\n self.conn.commit()\n cursor.close()\n self.conn.close()\n return cnt\n\n def concurrent_execute_sql(self, threads, duration, sql, max_id=0, commit_interval=500):\n pool = ThreadPool(threads)\n results = [pool.apply_async(self.execute_sql_duration, (duration, sql, max_id, commit_interval)) for _ in range(threads)]\n pool.close()\n pool.join()\n return results\n \ndef init():\n #add the config\n config_path = \"/root/DB-GPT/config/tool_config.yaml\"\n with open(config_path, 'r') as config_file:\n config = yaml.safe_load(config_file) \n db_args =DBArgs('pgsql', config)\n return db_args\n\n#create a table\ndef create_table(table_name,colsize, ncolumns):\n db=Database(init())\n column_definitions = ', '.join(f'name{i} varchar({colsize})' for i in range(ncolumns))\n creat_sql = f'CREATE TABLE {table_name} (id int, {column_definitions}, 
time timestamp);'\n db.execute_sqls(creat_sql)\n\n#delete the table\ndef delete_table(table_name):\n db=Database(init())\n delete_sql=f'DROP TABLE if exists {table_name}'\n db.execute_sqls(delete_sql)\n\n#print the current time\ndef print_time():\n current_time = datetime.datetime.now()\n formatted_time = current_time.strftime(\"%Y-%m-%d %H:%M:%S\")\n print(formatted_time)\n\n\ndef insert_large_data(threads,duration,ncolumns,nrows,colsize,table_name='table1'):\n\n print_time()\n #Delete undeleted tables\n delete_table(table_name)\n #create a new table\n create_table(table_name,colsize, ncolumns)\n db=Database(init())\n #insert the data\n #insert_definitions = ', '.join(f'repeat(round(random()*999)::text,{(colsize//3)})' for i in range(ncolumns))\n insert_definitions = ', '.join(f'(SELECT substr(md5(random()::text), 1, {colsize}))' for i in range(ncolumns))\n insert_data=f'insert into {table_name} select generate_series(1,{nrows}),{insert_definitions}, now();'\n db.concurrent_execute_sql(threads,duration,insert_data,commit_interval=1)\n\n #delete the table\n delete_table(table_name)\n \n #print the end time\n print_time()\nif __name__ == \"__main__\":\n # Number of threads to use for concurrent inserts\n num_threads = 149\n \n # Duration for which to run the inserts (in seconds)\n insert_duration = None\n \n # Number of columns in the table\n num_columns = 36\n \n # Number of rows to insert\n num_rows = 73\n \n # Size of each column (in characters)\n column_size = 72\n \n # Table name\n table_name = 'table1'\n \n # Call the insert_large_data function\n insert_large_data(num_threads, insert_duration, num_columns, num_rows, column_size, table_name)\n", + "description": "In an IoT application, 149 sensors generate a large amount of data that needs to be inserted simultaneously into a database table with 36 columns and 73 rows. Each column has a size of 72 characters. This scenario simulates a database exception caused by the high volume of data insertion.\n", + "desc": "In a file sharing system, there is a database called 'FileShareDB', which is used by various teams or organizations to share files. This database stores the files themselves as well as metadata about the files, such as the uploader's information, file size, creation date, modification date, version history, access permissions, and download counts. On a regular day, multiple users are uploading, downloading, or editing files concurrently. For example, a project team may be collaborating on an important report, with team members frequently uploading the latest versions of files while others download them for viewing or editing. Additionally, the system supports the storage and sharing of large files such as presentations, video conference recordings, or design drawings. However, when a large number of users simultaneously upload or download large files, the system's storage and network bandwidth can become strained, resulting in slower file transfer speeds. Moreover, frequent write operations in the database, such as file uploads and metadata updates, can impact database performance. 
During peak periods, the database may experience issues with locking and transaction management, further slowing down file processing and metadata recording.\n" + }, + "282": { + "start_time": "1697337966", + "end_time": "1697338027", + "start_timestamp": "2023-10-15 10:46:06", + "end_timestamp": "2023-10-15 10:47:07", + "alerts": [], + "labels": [ + "highly concurrent updates" + ], + "command": "python anomaly_trigger/main.py --anomaly LOCK_CONTENTION", + "script": "import psycopg2\nimport sys\nsys.path.append('/root/DB-GPT/')\nimport time\nimport datetime\nimport random\nimport yaml\nfrom multiprocessing.pool import *\n\n\nclass DBArgs(object):\n\n def __init__(self, dbtype, config, dbname=None):\n self.dbtype = dbtype\n if self.dbtype == 'mysql':\n self.host = config['host']\n self.port = config['port']\n self.user = config['user']\n self.password = config['password']\n self.dbname = dbname if dbname else config['dbname']\n self.driver = 'com.mysql.jdbc.Driver'\n self.jdbc = 'jdbc:mysql://'\n else:\n self.host = config['host']\n self.port = config['port']\n self.user = config['user']\n self.password = config['password']\n self.dbname = dbname if dbname else config['dbname']\n self.driver = 'org.postgresql.Driver'\n self.jdbc = 'jdbc:postgresql://'\n\nclass Database():\n def __init__(self, args, timeout=-1):\n self.args = args\n self.conn = self.resetConn(timeout)\n\n\n # self.schema = self.compute_table_schema()\n\n def resetConn(self, timeout=-1):\n conn = psycopg2.connect(database=self.args.dbname,\n user=self.args.user,\n password=self.args.password,\n host=self.args.host,\n port=self.args.port)\n return conn\n \n def execute_sqls(self,sql):\n self.conn =self.resetConn(timeout=-1)\n cur = self.conn.cursor()\n cur.execute(sql)\n self.conn.commit()\n cur.close()\n self.conn.close()\n\n def execute_sql_duration(self, duration, sql, max_id=0, commit_interval=500):\n self.conn = self.resetConn(timeout=-1)\n cursor = self.conn.cursor()\n start = time.time()\n cnt = 0\n if duration > 0:\n while (time.time() - start) < duration:\n if max_id > 0:\n id = random.randint(1, max_id - 1)\n cursor.execute(sql + str(id) + ';')\n else:\n cursor.execute(sql)\n cnt += 1\n if cnt % commit_interval == 0:\n self.conn.commit()\n else:\n print(\"error, the duration should be larger than 0\")\n self.conn.commit()\n cursor.close()\n self.conn.close()\n return cnt\n\n def concurrent_execute_sql(self, threads, duration, sql, max_id=0, commit_interval=500):\n pool = ThreadPool(threads)\n results = [pool.apply_async(self.execute_sql_duration, (duration, sql, max_id, commit_interval)) for _ in range(threads)]\n pool.close()\n pool.join()\n return results\n\ndef init():\n #add the config\n config_path = \"/root/DB-GPT/config/tool_config.yaml\"\n with open(config_path, 'r') as config_file:\n config = yaml.safe_load(config_file) \n db_args =DBArgs('pgsql', config)\n return db_args\n\n\n#create a table\ndef create_table(table_name,colsize, ncolumns):\n db=Database(init())\n column_definitions = ', '.join(f'name{i} varchar({colsize})' for i in range(ncolumns))\n creat_sql = f'CREATE TABLE {table_name} (id int, {column_definitions}, time timestamp);'\n db.execute_sqls(creat_sql)\n\n#delete the table\ndef delete_table(table_name):\n db=Database(init())\n delete_sql=f'DROP TABLE if exists {table_name}'\n db.execute_sqls(delete_sql)\n\n#print the current time\ndef print_time():\n current_time = datetime.datetime.now()\n formatted_time = current_time.strftime(\"%Y-%m-%d %H:%M:%S\")\n print(formatted_time)\n\n\ndef 
lock_contention(threads,duration,ncolumns,nrows,colsize,table_name='table1'):\n #create a new table\n print_time()\n delete_table(table_name)\n create_table(table_name,colsize, ncolumns)\n db=Database(init())\n # insert some data to be updated \n insert_definitions = ', '.join(f'(SELECT substr(md5(random()::text), 1, {colsize}))' for i in range(ncolumns))\n insert_data=f'insert into {table_name} select generate_series(1,{nrows}),{insert_definitions}, now();' \n db.execute_sqls(insert_data) \n pool = Pool(threads)\n for _ in range(threads):\n pool.apply_async(\n lock, (table_name, ncolumns, colsize, duration, nrows))\n pool.close()\n pool.join()\n #delete the table\n delete_table(table_name)\n print_time()\n\ndef lock(table_name, ncolumns, colsize, duration, nrows):\n args=init()\n start = time.time()\n #lock_contention\n while time.time()-start < duration:\n conn = psycopg2.connect(database=args.dbname, user=args.user, password=args.password,\n host=args.host, port=args.port)\n cur = conn.cursor()\n while time.time()-start < duration:\n col_name = random.randint(0, ncolumns-1)\n row_name = random.randint(1, nrows-1)\n lock_contention = f'update {table_name} set name{col_name}=(SELECT substr(md5(random()::text), 1, {colsize})) where id ={row_name}'\n #db.concurrent_execute_sql(threads,duration,lock_contention,nrows)\n cur.execute(lock_contention)\n conn.commit()\n conn.commit()\n conn.close()\n\nif __name__ == \"__main__\":\n # Number of threads to use for concurrent inserts\n num_threads = 149\n \n # Duration for which to run the inserts (in seconds)\n insert_duration = None\n \n # Number of columns in the table\n num_columns = 87\n \n # Number of rows to insert\n num_rows = 327\n \n # Size of each column (in characters)\n column_size = 54\n \n # Table name\n table_name = 'table1'\n \n # Call the insert_large_data function\n lock_contention(num_threads, insert_duration, num_columns, num_rows, column_size, table_name)\n", + "description": "In the database of an online platform, when 149 users simultaneously compete to perform frequent update operations in a database table containing 87 columns and 327 rows of data, with each column having a size of 54 characters, there is contention for locking the database table. This can lead to a database exception.\n", + "desc": "In a banking scenario, the system is experiencing lock contention issues. This database, named 'BankDB', is used by a bank to handle customer and transaction data. It contains a table called 'TransactionRecords' which records various transaction details. In this specific case, the table has 327 rows of data, with each row representing a transaction, and a total of 87 columns, each containing information up to 54 characters. These columns may include transaction ID, account number, transaction type, amount, date and time, counterparty information, transaction status, employee ID, transaction location, and currency type. At a certain moment, 149 users attempt to perform transactions simultaneously, resulting in contention for locks on the database table. This competition for locks among the users can lead to delays in transaction processing or even failures. This lock contention issue can impact the overall performance and efficiency of the banking system. 
Without appropriate measures, such as optimizing locking mechanisms, implementing concurrent transaction handling techniques, or utilizing distributed computing, the system might experience frequent lock conflicts, negatively impacting customers' transaction experiences and potentially leading to financial losses or reputational damage for the bank.\n" + }, + "283": { + "start_time": "1697338087", + "end_time": "1697338148", + "start_timestamp": "2023-10-15 10:48:07", + "end_timestamp": "2023-10-15 10:49:08", + "alerts": [], + "labels": [ + "highly deletes" + ], + "command": "python anomaly_trigger/main.py --anomaly VACUUM", + "script": "import psycopg2\nimport sys\nsys.path.append('/root/DB-GPT/')\nimport time\nimport datetime\nimport random\nimport yaml\nfrom multiprocessing.pool import *\n\n\nclass DBArgs(object):\n\n def __init__(self, dbtype, config, dbname=None):\n self.dbtype = dbtype\n if self.dbtype == 'mysql':\n self.host = config['host']\n self.port = config['port']\n self.user = config['user']\n self.password = config['password']\n self.dbname = dbname if dbname else config['dbname']\n self.driver = 'com.mysql.jdbc.Driver'\n self.jdbc = 'jdbc:mysql://'\n else:\n self.host = config['host']\n self.port = config['port']\n self.user = config['user']\n self.password = config['password']\n self.dbname = dbname if dbname else config['dbname']\n self.driver = 'org.postgresql.Driver'\n self.jdbc = 'jdbc:postgresql://'\n\nclass Database():\n def __init__(self, args, timeout=-1):\n self.args = args\n self.conn = self.resetConn(timeout)\n\n\n # self.schema = self.compute_table_schema()\n\n def resetConn(self, timeout=-1):\n conn = psycopg2.connect(database=self.args.dbname,\n user=self.args.user,\n password=self.args.password,\n host=self.args.host,\n port=self.args.port)\n return conn\n \n def execute_sqls(self,sql):\n self.conn =self.resetConn(timeout=-1)\n cur = self.conn.cursor()\n cur.execute(sql)\n self.conn.commit()\n cur.close()\n self.conn.close()\n\n def execute_sql_duration(self, duration, sql, max_id=0, commit_interval=500):\n self.conn = self.resetConn(timeout=-1)\n cursor = self.conn.cursor()\n start = time.time()\n cnt = 0\n if duration > 0:\n while (time.time() - start) < duration:\n if max_id > 0:\n id = random.randint(1, max_id - 1)\n cursor.execute(sql + str(id) + ';')\n else:\n cursor.execute(sql)\n cnt += 1\n if cnt % commit_interval == 0:\n self.conn.commit()\n else:\n print(\"error, the duration should be larger than 0\")\n self.conn.commit()\n cursor.close()\n self.conn.close()\n return cnt\n\n def concurrent_execute_sql(self, threads, duration, sql, max_id=0, commit_interval=500):\n pool = ThreadPool(threads)\n results = [pool.apply_async(self.execute_sql_duration, (duration, sql, max_id, commit_interval)) for _ in range(threads)]\n pool.close()\n pool.join()\n return results\n \ndef init():\n #add the config\n config_path = \"/root/DB-GPT/config/tool_config.yaml\"\n with open(config_path, 'r') as config_file:\n config = yaml.safe_load(config_file) \n db_args =DBArgs('pgsql', config)\n return db_args\n\n\n#create a table\ndef create_table(table_name,colsize, ncolumns):\n db=Database(init())\n column_definitions = ', '.join(f'name{i} varchar({colsize})' for i in range(ncolumns))\n creat_sql = f'CREATE TABLE {table_name} (id int, {column_definitions}, time timestamp);'\n db.execute_sqls(creat_sql)\n\n#delete the table\ndef delete_table(table_name):\n db=Database(init())\n delete_sql=f'DROP TABLE if exists {table_name}'\n db.execute_sqls(delete_sql)\n\n#print the 
current time\ndef print_time():\n current_time = datetime.datetime.now()\n formatted_time = current_time.strftime(\"%Y-%m-%d %H:%M:%S\")\n print(formatted_time)\n\n\ndef vacuum(threads,duration,ncolumns,nrows,colsize,table_name='table1'):\n db=Database(init())\n #create a new table\n print_time()\n delete_table(table_name)\n create_table(table_name,colsize, ncolumns)\n\n # insert some data to be deleted\n insert_definitions = ', '.join(f'(SELECT substr(md5(random()::text), 1, {colsize}))' for i in range(ncolumns))\n insert_data=f'insert into {table_name} select generate_series(1,{nrows}),{insert_definitions}, now();' \n db.execute_sqls(insert_data) \n\n # delete 80% of the rows\n delete_nrows=int(nrows*0.8)\n vacuum=f'delete from {table_name} where id < {delete_nrows};'\n db.execute_sqls(vacuum)\n\n # do the select , then the vacuum occurs\n select='select * from '+table_name+' where id='\n db.concurrent_execute_sql(threads,duration,select,nrows)\n\n #delete the table\n delete_table(table_name)\n print_time()\n\n\nif __name__ == \"__main__\":\n # Number of threads to use for concurrent inserts\n num_threads = 196\n \n # Duration for which to run the inserts (in seconds)\n insert_duration = None\n \n # Number of columns in the table\n num_columns = 7\n \n # Number of rows to insert\n num_rows = 3888722\n \n # Size of each column (in characters)\n column_size = 72\n \n # Table name\n table_name = 'table1'\n \n # Call the insert_large_data function\n vacuum(num_threads, insert_duration, num_columns, num_rows, column_size, table_name)\n", + "description": "In the database of a social media platform, there are 196 users searching for posts in a table that contains 7 columns, 3,888,722 rows, with each column size being 72 characters. However, after performing a vacuum operation on the database table, an exception occurs when these users simultaneously search for posts.\n", + "desc": "In an Internet of Things (IoT) scenario, there is a database called 'IoTDataDB' that is used for collecting and analyzing data from various IoT devices. This database is designed to handle a large volume of data from sensors, devices, or machines that are connected to the IoT network. The primary table in the database is called 'SensorData', which contains various fields to store data from 196 sensors. These fields may include sensor ID, sensor type, sensor value, timestamp, location, device ID, and other related information. In this specific scenario, the database administrator needs to perform a VACUUM operation on the 'SensorData' table. The VACUUM operation is a database maintenance task that reorganizes and cleans up the database by reclaiming space and optimizing performance. In this case, the VACUUM operation is performed to reclaim space after a large number of rows have been deleted or modified in the 'SensorData' table.Given that the 'SensorData' table contains 3,888,722 rows of data, with each row representing a reading from a sensor, and that there are 196 sensors producing data simultaneously, the VACUUM operation needs to be carried out using a multi-threaded approach with 196 threads. Each thread is responsible for cleaning up and optimizing a subset of the data in the table.The purpose of this VACUUM operation is to improve the storage efficiency and performance of the 'IoTDataDB' database. 
By reclaiming unused space, reorganizing data pages, and updating internal data structures, the VACUUM operation helps to reduce disk space consumption, enhance query performance, and maintain the overall health of the database.\n" + }, + "284": { + "start_time": "1697338208", + "end_time": "1697338322", + "start_timestamp": "2023-10-15 10:50:08", + "end_timestamp": "2023-10-15 10:52:02", + "alerts": [], + "labels": [ + "too many indexes" + ], + "command": "python anomaly_trigger/main.py --anomaly REDUNDANT_INDEX", + "script": "import psycopg2\nimport sys\nsys.path.append('/root/DB-GPT/')\nimport time\nimport datetime\nimport random\nimport yaml\nfrom multiprocessing.pool import *\n\n\nclass DBArgs(object):\n\n def __init__(self, dbtype, config, dbname=None):\n self.dbtype = dbtype\n if self.dbtype == 'mysql':\n self.host = config['host']\n self.port = config['port']\n self.user = config['user']\n self.password = config['password']\n self.dbname = dbname if dbname else config['dbname']\n self.driver = 'com.mysql.jdbc.Driver'\n self.jdbc = 'jdbc:mysql://'\n else:\n self.host = config['host']\n self.port = config['port']\n self.user = config['user']\n self.password = config['password']\n self.dbname = dbname if dbname else config['dbname']\n self.driver = 'org.postgresql.Driver'\n self.jdbc = 'jdbc:postgresql://'\n\nclass Database():\n def __init__(self, args, timeout=-1):\n self.args = args\n self.conn = self.resetConn(timeout)\n\n\n # self.schema = self.compute_table_schema()\n\n def resetConn(self, timeout=-1):\n conn = psycopg2.connect(database=self.args.dbname,\n user=self.args.user,\n password=self.args.password,\n host=self.args.host,\n port=self.args.port)\n return conn\n \n def execute_sqls(self,sql):\n self.conn =self.resetConn(timeout=-1)\n cur = self.conn.cursor()\n cur.execute(sql)\n self.conn.commit()\n cur.close()\n self.conn.close()\n\n def execute_sql_duration(self, duration, sql, max_id=0, commit_interval=500):\n self.conn = self.resetConn(timeout=-1)\n cursor = self.conn.cursor()\n start = time.time()\n cnt = 0\n if duration > 0:\n while (time.time() - start) < duration:\n if max_id > 0:\n id = random.randint(1, max_id - 1)\n cursor.execute(sql + str(id) + ';')\n else:\n cursor.execute(sql)\n cnt += 1\n if cnt % commit_interval == 0:\n self.conn.commit()\n else:\n print(\"error, the duration should be larger than 0\")\n self.conn.commit()\n cursor.close()\n self.conn.close()\n return cnt\n\n def concurrent_execute_sql(self, threads, duration, sql, max_id=0, commit_interval=500):\n pool = ThreadPool(threads)\n results = [pool.apply_async(self.execute_sql_duration, (duration, sql, max_id, commit_interval)) for _ in range(threads)]\n pool.close()\n pool.join()\n return results\n \n def build_index(self, table_name, idx_num):\n self.conn = self.resetConn(timeout=-1)\n cursor = self.conn.cursor()\n \n for i in range(0, idx_num):\n the_sql = 'CREATE INDEX index_' + table_name + '_' + str(i) + ' ON ' + table_name + '(name' + str(i) + ');'\n print(the_sql)\n cursor.execute(the_sql)\n\n \n self.conn.commit()\n self.conn.close()\n return\n\n\n \n def drop_index(self,table_name):\n self.conn = self.resetConn(timeout=-1)\n cursor = self.conn.cursor()\n cursor.execute(\"select indexname from pg_indexes where tablename='\"+table_name+\"';\")\n idxs = cursor.fetchall()\n for idx in idxs:\n the_sql = 'DROP INDEX ' + idx[0] + ';'\n cursor.execute(the_sql)\n print(the_sql)\n self.conn.commit()\n self.conn.close()\n return\n\n\ndef init():\n #add the config\n config_path = 
\"/root/DB-GPT/config/tool_config.yaml\"\n with open(config_path, 'r') as config_file:\n config = yaml.safe_load(config_file) \n db_args =DBArgs('pgsql', config)\n return db_args\n\n\n#create a table\ndef create_table(table_name,colsize, ncolumns):\n db=Database(init())\n column_definitions = ', '.join(f'name{i} varchar({colsize})' for i in range(ncolumns))\n creat_sql = f'CREATE TABLE {table_name} (id int, {column_definitions}, time timestamp);'\n db.execute_sqls(creat_sql)\n\n#delete the table\ndef delete_table(table_name):\n db=Database(init())\n delete_sql=f'DROP TABLE if exists {table_name}'\n db.execute_sqls(delete_sql)\n\n#print the current time\ndef print_time():\n current_time = datetime.datetime.now()\n formatted_time = current_time.strftime(\"%Y-%m-%d %H:%M:%S\")\n print(formatted_time)\n\ndef redundent_index(threads,duration,ncolumns,nrows,colsize,nindex,table_name='table1'):\n #create a new table\n print_time()\n delete_table(table_name)\n create_table(table_name,colsize, ncolumns)\n db=Database(init())\n # insert some data to be updated \n insert_definitions = ', '.join(f'(SELECT substr(md5(random()::text), 1, {colsize}))' for i in range(ncolumns))\n insert_data=f'insert into {table_name} select generate_series(1,{nrows}),{insert_definitions}, now();' \n db.execute_sqls(insert_data) \n\n #initialization of the indexes\n nindex=int((nindex*ncolumns)/10)\n db.build_index(table_name,nindex)\n id_index='CREATE INDEX index_'+table_name+'_id ON '+table_name+'(id);'\n db.execute_sqls(id_index)\n\n #lock_contention\n pool = Pool(threads)\n for _ in range(threads):\n pool.apply_async(\n lock, (table_name, ncolumns, colsize, duration, nrows))\n pool.close()\n pool.join()\n\n #drop the index\n db.drop_index(table_name)\n\n #delete the table\n delete_table(table_name)\n print_time()\n\n\ndef lock(table_name, ncolumns, colsize, duration, nrows):\n args=init()\n start = time.time()\n #lock_contention\n while time.time()-start < duration:\n conn = psycopg2.connect(database=args.dbname, user=args.user, password=args.password,\n host=args.host, port=args.port)\n cur = conn.cursor()\n while time.time()-start < duration:\n col_name = random.randint(0, ncolumns-1)\n row_name = random.randint(1, nrows-1)\n lock_contention = f'update {table_name} set name{col_name}=(SELECT substr(md5(random()::text), 1, {colsize})) where id ={row_name}'\n #db.concurrent_execute_sql(threads,duration,lock_contention,nrows)\n cur.execute(lock_contention)\n conn.commit()\n conn.commit()\n conn.close()\n\nif __name__ == \"__main__\":\n # Number of threads to use for concurrent inserts\n num_threads = 10\n \n # Duration for which to run the inserts (in seconds)\n insert_duration = None\n \n # Number of columns in the table\n num_columns = 90\n \n # Number of rows to insert\n num_rows = 692354\n \n # Size of each column (in characters)\n column_size = 57\n \n # Table name\n table_name = 'table1'\n \n nindex=6\n \n # Call the insert_large_data function\n redundent_index(num_threads, insert_duration, num_columns, num_rows, column_size, nindex,table_name)\n", + "description": "In a database used for tracking customer orders in a restaurant, a large number of indexes are created for various attributes such as order number, customer name, and order date. However, these indexes are redundant and unnecessary, resulting in additional storage consumption and decreased query performance. This scenario simulates the impact of creating redundant indexes on a database table with 10 concurrent users performing various operations. 
The table contains 90 columns and 692,354 rows, with each column having a size of 57 characters.\n", + "desc": "In a business intelligence scenario, where large corporations analyze their financial data, there is a database called 'CorporateFinanceDB'. It is designed to store and process financial records and transactions. One of the tables in this database, 'FinancialRecords', holds detailed information about various financial transactions. This table contains a total of 692,354 rows of data, with each row representing a financial record. It has a total of 90 columns, each capable of holding up to 57 characters. These columns store information such as transaction ID, transaction type (income, expenditure, assets, liabilities), transaction amount, date, department, project code, financial year, audit status, and more.In the context of this scenario, redundant index creation is a common anomaly. To accelerate complex financial queries, the database administrator might create a large number of indexes before executing the queries. These indexes could be based on the type of transaction, date range, department, or project code. However, frequent creation and deletion of indexes can lead to increased storage usage and performance overhead. It can also cause database fragmentation, which further impacts the overall performance of the database. In a business intelligence setting, this can result in delayed generation of financial reports, thereby affecting the efficiency of decision-making processes.\n" + }, + "285": { + "start_time": "1697338382", + "end_time": "1697338473", + "start_timestamp": "2023-10-15 10:53:02", + "end_timestamp": "2023-10-15 10:54:33", + "alerts": [], + "labels": [ + "INSERT_LARGE_DATA", + "IO_CONTENTION" + ], + "command": "python anomaly_trigger/main.py --anomaly INSERT_LARGE_DATA,IO_CONTENTION", + "script": "import os\nimport datetime\n\n#print the current time\ndef print_time():\n current_time = datetime.datetime.now()\n formatted_time = current_time.strftime(\"%Y-%m-%d %H:%M:%S\")\n print(formatted_time)\n\nif __name__ == \"__main__\":\n print_time()\n command = (\n \"su - root -c 'cd /sysbench-tpcc-master; \"\n \"./tpcc.lua --db-driver=pgsql --tables=2 --scale=3 --threads=50 --events=0 \"\n \"--pgsql-host=xxxx --pgsql-user=xxxx --pgsql-password=xxxx \"\n \"--pgsql-port=5432 --pgsql-db=tpcc --time=90 --rand-type=uniform --report-interval=10 run'\"\n )\n\n os.system(command)\n print_time()\n", + "description": "In a file sharing system where multiple users upload, download, or edit files simultaneously, the file system experiences contention in input/output operations, resulting in slower file transfers.\n", + "desc": "In a file sharing system scenario, let's consider a database called 'TeamFileShareDB' that is used by teams or organizations to share files. This database not only stores the files themselves but also keeps track of metadata related to the files, such as uploader information, file size, creation and modification dates, version history, access permissions, and download counts. On a typical workday, multiple users may be simultaneously uploading, downloading, or editing files. This includes scenarios where a project team collaborates on completing an important report, with team members frequently uploading the latest versions of files, while others download them for viewing or editing purposes. Additionally, the system might be used for storing and sharing large files, such as presentations, video conference recordings, or design drawings. 
Due to the high concurrency in file operations, the 'TeamFileShareDB' database faces challenges related to I/O (input/output) contention. When multiple users simultaneously upload or download large files, there might be significant strain on the system's storage and network bandwidth. This I/O contention can lead to slower file transfer speeds, especially in situations where there is limited bandwidth or insufficient server processing capabilities. Furthermore, the frequent write operations in the database (such as file uploads and metadata updates) can impact database performance. During peak periods, the database might encounter locking and transaction management issues, further delaying file processing and metadata recording.\n" + }, + "286": { + "start_time": "1697338533", + "end_time": "1697338594", + "start_timestamp": "2023-10-15 10:55:33", + "end_timestamp": "2023-10-15 10:56:34", + "alerts": [], + "labels": [ + "POOR JOIN PERFORMANCE", + "CPU CONTENTION" + ], + "command": "python anomaly_trigger/main.py --anomaly POOR_JOIN_PERFORMANCE,CPU_CONTENTION", + "script": "import os\nimport re\nimport time\n\nimport psycopg2\n\n\nREPEATCOUNT = 1\nTIMELOGPATH = str(int(time.time())) + \"_job_trigger_time_log.txt\"\nTIMELOG = open(TIMELOGPATH, 'w+')\n\n\nclass Database():\n\n def __init__(self):\n self.conn = None\n self.conn = psycopg2.connect(database='imdbload',\n user='xxxx',\n password='xxxx',\n host='xxxx',\n port=5432)\n\n def execute_sql(self, sql):\n fail = 1\n cur = self.conn.cursor()\n i = 0\n cnt = 3\n while fail == 1 and i < cnt:\n try:\n fail = 0\n cur.execute(sql)\n except BaseException as error:\n fail = 1\n print(error)\n res = []\n if fail == 0:\n res = cur.fetchall()\n i = i + 1\n if fail == 1:\n # print(\"SQL Execution Fatal!!\", sql)\n return 0, ''\n elif fail == 0:\n return 1, res\n\n\ndef all_sql_files():\n res_path = \"{}/join-order-benchmark-master/\".format(\n os.path.dirname(os.path.abspath(__file__)))\n # all_file_list = list(filter(file_filter, os.listdir(res_path)))\n # all_file_list = sorted(all_file_list, key=custom_sort)\n all_file_list = [\n '1a.sql', '1b.sql', '1c.sql', '1d.sql',\n '2a.sql', '2b.sql', '2c.sql', '2d.sql',\n '3a.sql', '3b.sql', '3c.sql',\n '4a.sql', '4b.sql', '4c.sql',\n '5a.sql', '5b.sql', '5c.sql',\n '6a.sql', '6b.sql', '6c.sql', '6d.sql', '6e.sql', '6f.sql',\n '7a.sql', '7b.sql', '7c.sql',\n '8a.sql', '8b.sql', '8c.sql', '8d.sql',\n '9a.sql', '9b.sql', '9c.sql', '9d.sql',\n '10a.sql', '10b.sql', '10c.sql',\n '11a.sql', '11b.sql', '11c.sql', '11d.sql',\n '12a.sql', '12b.sql', '12c.sql',\n '13a.sql', '13b.sql', '13c.sql', '13d.sql',\n '14a.sql', '14b.sql', '14c.sql',\n '15a.sql', '15b.sql', '15c.sql', '15d.sql',\n '16a.sql', '16b.sql', '16c.sql', '16d.sql',\n '17a.sql', '17b.sql', '17c.sql', '17d.sql', '17e.sql', '17f.sql',\n '18a.sql', '18b.sql', '18c.sql',\n '19a.sql', '19b.sql', '19c.sql', '19d.sql',\n '20a.sql', '20b.sql', '20c.sql',\n '21a.sql', '21b.sql', '21c.sql',\n '22a.sql', '22b.sql', '22c.sql', '22d.sql',\n '23a.sql', '23b.sql', '23c.sql',\n '24a.sql', '24b.sql',\n '25a.sql', '25b.sql', '25c.sql',\n '26a.sql', '26b.sql', '26c.sql',\n '27a.sql', '27b.sql', '27c.sql',\n '28a.sql', '28b.sql', '28c.sql',\n '29a.sql', '29b.sql', '29c.sql',\n '30a.sql', '30b.sql', '30c.sql',\n '31a.sql', '31b.sql', '31c.sql',\n '32a.sql', '32b.sql',\n '33a.sql', '33b.sql', '33c.sql']\n\n print(all_file_list)\n files_list = []\n for file in all_file_list:\n files_list.append(res_path + file)\n return files_list\n\n\ndef custom_sort(item):\n # 
\u63d0\u53d6\u6570\u5b57\u548c\u5b57\u6bcd\u90e8\u5206\n match = re.match(r'(\\d+)(\\D+)', item)\n # \u5c06\u6570\u5b57\u90e8\u5206\u8f6c\u6362\u4e3a\u6574\u6570\u4ee5\u8fdb\u884c\u6bd4\u8f83\n num_part = int(match.group(1))\n # \u8fd4\u56de\u5143\u7ec4\u4ee5\u6309\u6570\u5b57\u548c\u5b57\u6bcd\u6392\u5e8f\n return (num_part, match.group(2))\n\n\ndef file_filter(f):\n if f[-4:] == '.sql' and 'schema' not in f and 'fkindexes' not in f:\n return True\n else:\n return False\n\n\ndef get_sql_from_file(file_name):\n file = open(file_name)\n lines = file.readlines().copy()\n sql = ''\n for line in lines:\n sql += line\n sql = sql.replace('\n', ' ').replace(' ', ' ').replace(' ', ' ')\n file.close()\n return sql\n\n\ndef test_hint_from_file(sql_file):\n db = Database()\n sql = get_sql_from_file(sql_file)\n success, result_cont = db.execute_sql(sql)\n print(success, result_cont)\n\n\ndef test_all():\n sql_files = all_sql_files()\n\n for sql_file in list(sql_files)[:-10]:\n if sql_file:\n test_hint_from_file(sql_file)\n\n\ndef test_one():\n res_path = \"{}/join-order-benchmark-master/\".format(\n os.path.dirname(os.path.abspath(__file__)))\n test_hint_from_file(res_path + '1a.sql')\n\n\nif __name__ == '__main__':\n for i in range(0, REPEATCOUNT):\n TIMELOG.write(str(int(time.time()))+\";\")\n test_all()\n TIMELOG.write(str(int(time.time()))+\"\n\")\n TIMELOG.flush()\n\n TIMELOG.close()\n\n)\n", + "description": "In a data analytics platform, multiple users are performing join operations on a large dataset using multiple CPU cores simultaneously. Due to CPU contention, the performance of join operations is compromised, resulting in slower execution times.\n", + "desc": "In an IoT scenario, imagine a database named 'SmartHomeDB', which is used to store various data generated by sensors in a smart home. This data includes information from devices such as temperature sensors, motion sensors, smart appliances, and security cameras. The database contains multiple tables, one of which is a key table named 'SensorData', which records data from different sensors. This table consists of millions of rows of data, each row representing a sensor reading, with a total of 50 columns, each containing information of up to 100 characters. These columns may include sensor ID, sensor type, reading value, reading timestamp, sensor location, and other relevant data.In this scenario, the smart home system is constantly generating a large amount of sensor data. For example, temperature sensors might be recording readings every few seconds, motion sensors might be detecting movements, and smart appliances might be sending out status updates. This continuous stream of data can lead to an influx of new rows being inserted into the 'SensorData' table. If the database is not optimized for handling such a high insertion rate, it can result in poor join performance and CPU contention.The poor join performance occurs when queries require joining the 'SensorData' table with other tables to analyze the data. If the join keys are not properly indexed or if the queries are not optimized, it can result in slow query execution. This can be particularly problematic when multiple complex join queries are running simultaneously, leading to competition for CPU resources. 
The CPU contention occurs when the server's CPU resources are not sufficient to handle the compute-intensive queries, causing delays and performance degradation in the database.Overall, these issues can impact the efficiency and responsiveness of the smart home system, as queries and analysis of sensor data might take longer than expected. It can also result in delays in real-time processing of sensor data, affecting the functionality and performance of the IoT devices in the smart home.\n" + }, + "287": { + "start_time": "1697338654", + "end_time": "1697338803", + "start_timestamp": "2023-10-15 10:57:34", + "end_timestamp": "2023-10-15 11:00:03", + "alerts": [], + "labels": [ + "FETCH_LARGE_DATA", + "CORRELATED SUBQUERY" + ], + "command": "python anomaly_trigger/main.py --anomaly FETCH_LARGE_DATA,CORRELATED_SUBQUERY", + "script": "import os\nimport re\nimport time\n\nimport psycopg2\n\n\nREPEATCOUNT = 1\nTIMELOGPATH = str(int(time.time())) + \"_tpch_trigger_time_log.txt\"\nTIMELOG = open(TIMELOGPATH, 'w+')\n\n\nclass Database():\n\n def __init__(self):\n self.conn = None\n self.conn = psycopg2.connect(database='tpch',\n user='xxxx',\n password='xxxx',\n host='xxxx',\n port=5432)\n\n def execute_sql(self, sql):\n fail = 1\n cur = self.conn.cursor()\n i = 0\n cnt = 3\n while fail == 1 and i < cnt:\n try:\n fail = 0\n cur.execute(sql)\n except BaseException as error:\n fail = 1\n print(error)\n res = []\n if fail == 0:\n res = cur.fetchall()\n i = i + 1\n if fail == 1:\n # print(\"SQL Execution Fatal!!\", sql)\n return 0, ''\n elif fail == 0:\n return 1, res\n\n\ndef all_sql_files():\n res_path = \"{}/tpch-queries/\".format(\n os.path.dirname(os.path.abspath(__file__)))\n # all_file_list = list(filter(file_filter, os.listdir(res_path)))\n # all_file_list = sorted(all_file_list, key=custom_sort)\n all_file_list = [\n '4.explain.sql']\n\n print(all_file_list)\n files_list = []\n for file in all_file_list:\n files_list.append(res_path + file)\n return files_list\n\n\ndef custom_sort(item):\n # \u63d0\u53d6\u6570\u5b57\u548c\u5b57\u6bcd\u90e8\u5206\n match = re.match(r'(\\d+)(\\D+)', item)\n # \u5c06\u6570\u5b57\u90e8\u5206\u8f6c\u6362\u4e3a\u6574\u6570\u4ee5\u8fdb\u884c\u6bd4\u8f83\n num_part = int(match.group(1))\n # \u8fd4\u56de\u5143\u7ec4\u4ee5\u6309\u6570\u5b57\u548c\u5b57\u6bcd\u6392\u5e8f\n return (num_part, match.group(2))\n\n\ndef file_filter(f):\n if f[-4:] == '.sql' and 'schema' not in f and 'fkindexes' not in f:\n return True\n else:\n return False\n\n\ndef get_sql_from_file(file_name):\n file = open(file_name)\n lines = file.readlines().copy()\n sql = ''\n for line in lines:\n sql += line\n sql = sql.replace('\n', ' ').replace(' ', ' ').replace(' ', ' ')\n file.close()\n return sql\n\n\ndef test_hint_from_file(sql_file):\n db = Database()\n sql = get_sql_from_file(sql_file)\n success, result_cont = db.execute_sql(sql)\n print(success, result_cont)\n\n\ndef test_all():\n sql_files = all_sql_files()\n\n for sql_file in list(sql_files):\n if sql_file:\n test_hint_from_file(sql_file)\n\n\ndef test_one():\n res_path = \"{}/tpch-queries/\".format(\n os.path.dirname(os.path.abspath(__file__)))\n test_hint_from_file(res_path + '1.explain.sql')\n\n\nif __name__ == '__main__':\n for i in range(0, REPEATCOUNT):\n TIMELOG.write(str(int(time.time()))+\";\")\n test_all()\n TIMELOG.write(str(int(time.time()))+\"\n\")\n TIMELOG.flush()\n\n TIMELOG.close()\n", + "description": "In a scenario where a company is managing its inventory in a database, there is a need to fetch large amounts of data 
related to the inventory, specifically the amount of inventory for each product. This requires the execution of correlated subqueries, which can become inefficient when dealing with a large number of products. The provided script is a simulation of this scenario, where the \"FETCH_LARGE_DATA,CORRELATED_SUBQUERY\" anomaly is triggered to replicate the performance degradation caused by inefficient subqueries.\n", + "desc": "In an internet of things (IoT) scenario, there is a database called 'IoTDataDB' that stores various sensor data collected from smart devices. This database contains a main table called 'SensorReadings' which records data collected from multiple sensors. Each row in this table represents a reading from a specific sensor, and it contains information such as the sensor ID, reading type (temperature, humidity, pressure, etc.), value, timestamp, and location. In this specific scenario, due to the large volume of sensor data being collected at a high frequency, the database encounters performance issues when trying to fetch and process the data. This is primarily because the database lacks effective indexing and optimization techniques when performing fetch operations, particularly when dealing with correlated subqueries involving multiple columns and conditions. Consequently, the database faces challenges in efficiently retrieving and joining the required data from the 'SensorReadings' table, resulting in delays or inefficiencies in data retrieval and analysis.\n" + }, + "288": { + "start_time": "1697338863", + "end_time": "1697338935", + "start_timestamp": "2023-10-15 11:01:03", + "end_timestamp": "2023-10-15 11:02:15", + "alerts": [], + "labels": [ + "highly concurrent commits or highly concurrent inserts" + ], + "command": "python anomaly_trigger/main.py --anomaly INSERT_LARGE_DATA", + "script": "import psycopg2\nimport sys\nsys.path.append('/root/DB-GPT/')\nimport time\nimport datetime\nimport random\nimport yaml\nfrom multiprocessing.pool import *\n\nclass DBArgs(object):\n\n def __init__(self, dbtype, config, dbname=None):\n self.dbtype = dbtype\n if self.dbtype == 'mysql':\n self.host = config['host']\n self.port = config['port']\n self.user = config['user']\n self.password = config['password']\n self.dbname = dbname if dbname else config['dbname']\n self.driver = 'com.mysql.jdbc.Driver'\n self.jdbc = 'jdbc:mysql://'\n else:\n self.host = config['host']\n self.port = config['port']\n self.user = config['user']\n self.password = config['password']\n self.dbname = dbname if dbname else config['dbname']\n self.driver = 'org.postgresql.Driver'\n self.jdbc = 'jdbc:postgresql://'\n\nclass Database():\n def __init__(self, args, timeout=-1):\n self.args = args\n self.conn = self.resetConn(timeout)\n\n\n # self.schema = self.compute_table_schema()\n\n def resetConn(self, timeout=-1):\n conn = psycopg2.connect(database=self.args.dbname,\n user=self.args.user,\n password=self.args.password,\n host=self.args.host,\n port=self.args.port)\n return conn\n\n def execute_sqls(self,sql):\n self.conn =self.resetConn(timeout=-1)\n cur = self.conn.cursor()\n cur.execute(sql)\n self.conn.commit()\n cur.close()\n self.conn.close()\n\n def execute_sql_duration(self, duration, sql, max_id=0, commit_interval=500):\n self.conn = self.resetConn(timeout=-1)\n cursor = self.conn.cursor()\n start = time.time()\n cnt = 0\n if duration > 0:\n while (time.time() - start) < duration:\n if max_id > 0:\n id = random.randint(1, max_id - 1)\n cursor.execute(sql + str(id) + ';')\n else:\n 
cursor.execute(sql)\n cnt += 1\n if cnt % commit_interval == 0:\n self.conn.commit()\n else:\n print(\"error, the duration should be larger than 0\")\n self.conn.commit()\n cursor.close()\n self.conn.close()\n return cnt\n\n def concurrent_execute_sql(self, threads, duration, sql, max_id=0, commit_interval=500):\n pool = ThreadPool(threads)\n results = [pool.apply_async(self.execute_sql_duration, (duration, sql, max_id, commit_interval)) for _ in range(threads)]\n pool.close()\n pool.join()\n return results\n \ndef init():\n #add the config\n config_path = \"/root/DB-GPT/config/tool_config.yaml\"\n with open(config_path, 'r') as config_file:\n config = yaml.safe_load(config_file) \n db_args =DBArgs('pgsql', config)\n return db_args\n\n#create a table\ndef create_table(table_name,colsize, ncolumns):\n db=Database(init())\n column_definitions = ', '.join(f'name{i} varchar({colsize})' for i in range(ncolumns))\n creat_sql = f'CREATE TABLE {table_name} (id int, {column_definitions}, time timestamp);'\n db.execute_sqls(creat_sql)\n\n#delete the table\ndef delete_table(table_name):\n db=Database(init())\n delete_sql=f'DROP TABLE if exists {table_name}'\n db.execute_sqls(delete_sql)\n\n#print the current time\ndef print_time():\n current_time = datetime.datetime.now()\n formatted_time = current_time.strftime(\"%Y-%m-%d %H:%M:%S\")\n print(formatted_time)\n\n\ndef insert_large_data(threads,duration,ncolumns,nrows,colsize,table_name='table1'):\n\n print_time()\n #Delete undeleted tables\n delete_table(table_name)\n #create a new table\n create_table(table_name,colsize, ncolumns)\n db=Database(init())\n #insert the data\n #insert_definitions = ', '.join(f'repeat(round(random()*999)::text,{(colsize//3)})' for i in range(ncolumns))\n insert_definitions = ', '.join(f'(SELECT substr(md5(random()::text), 1, {colsize}))' for i in range(ncolumns))\n insert_data=f'insert into {table_name} select generate_series(1,{nrows}),{insert_definitions}, now();'\n db.concurrent_execute_sql(threads,duration,insert_data,commit_interval=1)\n\n #delete the table\n delete_table(table_name)\n \n #print the end time\n print_time()\nif __name__ == \"__main__\":\n # Number of threads to use for concurrent inserts\n num_threads = 101\n \n # Duration for which to run the inserts (in seconds)\n insert_duration = None\n \n # Number of columns in the table\n num_columns = 17\n \n # Number of rows to insert\n num_rows = 71\n \n # Size of each column (in characters)\n column_size = 46\n \n # Table name\n table_name = 'table1'\n \n # Call the insert_large_data function\n insert_large_data(num_threads, insert_duration, num_columns, num_rows, column_size, table_name)\n", + "description": "In an IoT system, there are 101 sensors generating a large amount of data that needs to be inserted into the database simultaneously. Each sensor generates data with 17 columns, each column containing 46 characters. There are a total of 71 rows of data being inserted. This process simulates the database exception caused by the large influx of data.\n", + "desc": "In a file sharing system scenario, there is a database called 'FileShareDB' that is used for sharing files among users. The database stores the files themselves as well as metadata information such as file sizes, upload dates, and user permissions. When multiple users are simultaneously uploading or downloading files, the database might encounter performance issues. 
In this case, the user is attempting to insert a large amount of data into the database by running the script 'anomaly_trigger/main.py'. The script specifies the anomaly 'INSERT_LARGE_DATA', indicating that a large data insertion operation is being performed. The script also includes parameters such as 101 threads, indicating that 101 concurrent threads are being used for the insertion operation. Additionally, the parameters specify the number of columns (17), the size of each column (46 characters), and the number of rows (71). This large-scale data insertion, combined with concurrent threads, can potentially lead to performance bottlenecks, such as high CPU usage or contention for database resources.\n" + }, + "289": { + "start_time": "1697338995", + "end_time": "1697339066", + "start_timestamp": "2023-10-15 11:03:15", + "end_timestamp": "2023-10-15 11:04:26", + "alerts": [], + "labels": [ + "highly concurrent commits or highly concurrent inserts" + ], + "command": "python anomaly_trigger/main.py --anomaly INSERT_LARGE_DATA", + "script": "import psycopg2\nimport sys\nsys.path.append('/root/DB-GPT/')\nimport time\nimport datetime\nimport random\nimport yaml\nfrom multiprocessing.pool import *\n\nclass DBArgs(object):\n\n def __init__(self, dbtype, config, dbname=None):\n self.dbtype = dbtype\n if self.dbtype == 'mysql':\n self.host = config['host']\n self.port = config['port']\n self.user = config['user']\n self.password = config['password']\n self.dbname = dbname if dbname else config['dbname']\n self.driver = 'com.mysql.jdbc.Driver'\n self.jdbc = 'jdbc:mysql://'\n else:\n self.host = config['host']\n self.port = config['port']\n self.user = config['user']\n self.password = config['password']\n self.dbname = dbname if dbname else config['dbname']\n self.driver = 'org.postgresql.Driver'\n self.jdbc = 'jdbc:postgresql://'\n\nclass Database():\n def __init__(self, args, timeout=-1):\n self.args = args\n self.conn = self.resetConn(timeout)\n\n\n # self.schema = self.compute_table_schema()\n\n def resetConn(self, timeout=-1):\n conn = psycopg2.connect(database=self.args.dbname,\n user=self.args.user,\n password=self.args.password,\n host=self.args.host,\n port=self.args.port)\n return conn\n\n def execute_sqls(self,sql):\n self.conn =self.resetConn(timeout=-1)\n cur = self.conn.cursor()\n cur.execute(sql)\n self.conn.commit()\n cur.close()\n self.conn.close()\n\n def execute_sql_duration(self, duration, sql, max_id=0, commit_interval=500):\n self.conn = self.resetConn(timeout=-1)\n cursor = self.conn.cursor()\n start = time.time()\n cnt = 0\n if duration > 0:\n while (time.time() - start) < duration:\n if max_id > 0:\n id = random.randint(1, max_id - 1)\n cursor.execute(sql + str(id) + ';')\n else:\n cursor.execute(sql)\n cnt += 1\n if cnt % commit_interval == 0:\n self.conn.commit()\n else:\n print(\"error, the duration should be larger than 0\")\n self.conn.commit()\n cursor.close()\n self.conn.close()\n return cnt\n\n def concurrent_execute_sql(self, threads, duration, sql, max_id=0, commit_interval=500):\n pool = ThreadPool(threads)\n results = [pool.apply_async(self.execute_sql_duration, (duration, sql, max_id, commit_interval)) for _ in range(threads)]\n pool.close()\n pool.join()\n return results\n \ndef init():\n #add the config\n config_path = \"/root/DB-GPT/config/tool_config.yaml\"\n with open(config_path, 'r') as config_file:\n config = yaml.safe_load(config_file) \n db_args =DBArgs('pgsql', config)\n return db_args\n\n#create a table\ndef create_table(table_name,colsize, 
ncolumns):\n db=Database(init())\n column_definitions = ', '.join(f'name{i} varchar({colsize})' for i in range(ncolumns))\n creat_sql = f'CREATE TABLE {table_name} (id int, {column_definitions}, time timestamp);'\n db.execute_sqls(creat_sql)\n\n#delete the table\ndef delete_table(table_name):\n db=Database(init())\n delete_sql=f'DROP TABLE if exists {table_name}'\n db.execute_sqls(delete_sql)\n\n#print the current time\ndef print_time():\n current_time = datetime.datetime.now()\n formatted_time = current_time.strftime(\"%Y-%m-%d %H:%M:%S\")\n print(formatted_time)\n\n\ndef insert_large_data(threads,duration,ncolumns,nrows,colsize,table_name='table1'):\n\n print_time()\n #Delete undeleted tables\n delete_table(table_name)\n #create a new table\n create_table(table_name,colsize, ncolumns)\n db=Database(init())\n #insert the data\n #insert_definitions = ', '.join(f'repeat(round(random()*999)::text,{(colsize//3)})' for i in range(ncolumns))\n insert_definitions = ', '.join(f'(SELECT substr(md5(random()::text), 1, {colsize}))' for i in range(ncolumns))\n insert_data=f'insert into {table_name} select generate_series(1,{nrows}),{insert_definitions}, now();'\n db.concurrent_execute_sql(threads,duration,insert_data,commit_interval=1)\n\n #delete the table\n delete_table(table_name)\n \n #print the end time\n print_time()\nif __name__ == \"__main__\":\n # Number of threads to use for concurrent inserts\n num_threads = 101\n \n # Duration for which to run the inserts (in seconds)\n insert_duration = None\n \n # Number of columns in the table\n num_columns = 37\n \n # Number of rows to insert\n num_rows = 53\n \n # Size of each column (in characters)\n column_size = 62\n \n # Table name\n table_name = 'table1'\n \n # Call the insert_large_data function\n insert_large_data(num_threads, insert_duration, num_columns, num_rows, column_size, table_name)\n", + "description": "In an IoT system, 101 sensors are generating a large amount of data that needs to be simultaneously inserted into a database. The database table has 37 columns and 53 rows, with each column being 62 characters in size. This task simulates the database exception caused by the high volume data insertion process.\n", + "desc": "In an IoT scenario, we can imagine a smart home automation system that uses a database called \"SmartHomeDB\" to store and manage data from various sensors and devices. This database contains a table named \"SensorData\" which records information from 53 different sensors, such as temperature, humidity, motion, light, and more. Each row in the table represents a specific sensor reading, with a total of 37 columns to store data such as sensor ID, reading type, value, timestamp, location, and status.In this particular scenario, there is a need to insert a large amount of sensor data into the \"SensorData\" table simultaneously. To simulate this, the script includes the parameter \"--anomaly INSERT_LARGE_DATA\" and sets the number of threads to 101. This means that 101 threads will be used to insert data into the database concurrently.However, this insertion operation might face challenges due to the high number of concurrent threads and the large amount of data being inserted. If proper optimization measures, such as bulk inserts or partitioning the data, are not implemented, this could lead to performance issues in the database. These issues might include high CPU and disk usage, increased write latency, and potentially even database lock contention. 
These anomalies could impact the overall efficiency and reliability of the smart home automation system.\n" + }, + "290": { + "start_time": "1697339126", + "end_time": "1697339187", + "start_timestamp": "2023-10-15 11:05:26", + "end_timestamp": "2023-10-15 11:06:27", + "alerts": [], + "labels": [ + "highly concurrent updates" + ], + "command": "python anomaly_trigger/main.py --anomaly LOCK_CONTENTION", + "script": "import psycopg2\nimport sys\nsys.path.append('/root/DB-GPT/')\nimport time\nimport datetime\nimport random\nimport yaml\nfrom multiprocessing.pool import *\n\n\nclass DBArgs(object):\n\n def __init__(self, dbtype, config, dbname=None):\n self.dbtype = dbtype\n if self.dbtype == 'mysql':\n self.host = config['host']\n self.port = config['port']\n self.user = config['user']\n self.password = config['password']\n self.dbname = dbname if dbname else config['dbname']\n self.driver = 'com.mysql.jdbc.Driver'\n self.jdbc = 'jdbc:mysql://'\n else:\n self.host = config['host']\n self.port = config['port']\n self.user = config['user']\n self.password = config['password']\n self.dbname = dbname if dbname else config['dbname']\n self.driver = 'org.postgresql.Driver'\n self.jdbc = 'jdbc:postgresql://'\n\nclass Database():\n def __init__(self, args, timeout=-1):\n self.args = args\n self.conn = self.resetConn(timeout)\n\n\n # self.schema = self.compute_table_schema()\n\n def resetConn(self, timeout=-1):\n conn = psycopg2.connect(database=self.args.dbname,\n user=self.args.user,\n password=self.args.password,\n host=self.args.host,\n port=self.args.port)\n return conn\n \n def execute_sqls(self,sql):\n self.conn =self.resetConn(timeout=-1)\n cur = self.conn.cursor()\n cur.execute(sql)\n self.conn.commit()\n cur.close()\n self.conn.close()\n\n def execute_sql_duration(self, duration, sql, max_id=0, commit_interval=500):\n self.conn = self.resetConn(timeout=-1)\n cursor = self.conn.cursor()\n start = time.time()\n cnt = 0\n if duration > 0:\n while (time.time() - start) < duration:\n if max_id > 0:\n id = random.randint(1, max_id - 1)\n cursor.execute(sql + str(id) + ';')\n else:\n cursor.execute(sql)\n cnt += 1\n if cnt % commit_interval == 0:\n self.conn.commit()\n else:\n print(\"error, the duration should be larger than 0\")\n self.conn.commit()\n cursor.close()\n self.conn.close()\n return cnt\n\n def concurrent_execute_sql(self, threads, duration, sql, max_id=0, commit_interval=500):\n pool = ThreadPool(threads)\n results = [pool.apply_async(self.execute_sql_duration, (duration, sql, max_id, commit_interval)) for _ in range(threads)]\n pool.close()\n pool.join()\n return results\n\ndef init():\n #add the config\n config_path = \"/root/DB-GPT/config/tool_config.yaml\"\n with open(config_path, 'r') as config_file:\n config = yaml.safe_load(config_file) \n db_args =DBArgs('pgsql', config)\n return db_args\n\n\n#create a table\ndef create_table(table_name,colsize, ncolumns):\n db=Database(init())\n column_definitions = ', '.join(f'name{i} varchar({colsize})' for i in range(ncolumns))\n creat_sql = f'CREATE TABLE {table_name} (id int, {column_definitions}, time timestamp);'\n db.execute_sqls(creat_sql)\n\n#delete the table\ndef delete_table(table_name):\n db=Database(init())\n delete_sql=f'DROP TABLE if exists {table_name}'\n db.execute_sqls(delete_sql)\n\n#print the current time\ndef print_time():\n current_time = datetime.datetime.now()\n formatted_time = current_time.strftime(\"%Y-%m-%d %H:%M:%S\")\n print(formatted_time)\n\n\ndef 
lock_contention(threads,duration,ncolumns,nrows,colsize,table_name='table1'):\n #create a new table\n print_time()\n delete_table(table_name)\n create_table(table_name,colsize, ncolumns)\n db=Database(init())\n # insert some data to be updated \n insert_definitions = ', '.join(f'(SELECT substr(md5(random()::text), 1, {colsize}))' for i in range(ncolumns))\n insert_data=f'insert into {table_name} select generate_series(1,{nrows}),{insert_definitions}, now();' \n db.execute_sqls(insert_data) \n pool = Pool(threads)\n for _ in range(threads):\n pool.apply_async(\n lock, (table_name, ncolumns, colsize, duration, nrows))\n pool.close()\n pool.join()\n #delete the table\n delete_table(table_name)\n print_time()\n\ndef lock(table_name, ncolumns, colsize, duration, nrows):\n args=init()\n start = time.time()\n #lock_contention\n while time.time()-start < duration:\n conn = psycopg2.connect(database=args.dbname, user=args.user, password=args.password,\n host=args.host, port=args.port)\n cur = conn.cursor()\n while time.time()-start < duration:\n col_name = random.randint(0, ncolumns-1)\n row_name = random.randint(1, nrows-1)\n lock_contention = f'update {table_name} set name{col_name}=(SELECT substr(md5(random()::text), 1, {colsize})) where id ={row_name}'\n #db.concurrent_execute_sql(threads,duration,lock_contention,nrows)\n cur.execute(lock_contention)\n conn.commit()\n conn.commit()\n conn.close()\n\nif __name__ == \"__main__\":\n # Number of threads to use for concurrent inserts\n num_threads = 73\n \n # Duration for which to run the inserts (in seconds)\n insert_duration = None\n \n # Number of columns in the table\n num_columns = 82\n \n # Number of rows to insert\n num_rows = 208\n \n # Size of each column (in characters)\n column_size = 56\n \n # Table name\n table_name = 'table1'\n \n # Call the insert_large_data function\n lock_contention(num_threads, insert_duration, num_columns, num_rows, column_size, table_name)\n", + "description": "In a database for a social media platform, 73 users simultaneously attempt to perform frequent update operations in a database table containing 82 columns and 208 rows of user records, each with a column size of 56 characters. These users compete with each other to lock the database table and perform the update operation. The purpose of this simulation is to observe if any exceptions occur due to lock contention in the database.\n", + "desc": "In a life scenario involving file sharing, there is a database called 'TeamFileShareDB' which is a system used by teams or organizations for sharing files. This database stores both the files themselves and the metadata associated with them, such as uploader information, file size, creation and modification dates, version history, access permissions, and download counts. Multiple users may be simultaneously uploading, downloading, or editing files within this system. For example, a project team collaborating on an important report may frequently upload the latest versions of files that other team members download for viewing or editing. These files can include large files such as presentations, video conference recordings, or design drawings. With such high concurrency in file operations, the 'TeamFileShareDB' database faces challenges with lock contention. When multiple users attempt to access the same or nearby rows within the database table simultaneously, there can be competition for locking the table, resulting in lock contention. 
If lock contention occurs frequently or for extended periods of time, it can lead to delayed or failed file operations, impacting the productivity and efficiency of the file sharing system.\n" + }, + "291": { + "start_time": "1697339247", + "end_time": "1697339316", + "start_timestamp": "2023-10-15 11:07:27", + "end_timestamp": "2023-10-15 11:08:36", + "alerts": [], + "labels": [ + "highly deletes" + ], + "command": "python anomaly_trigger/main.py --anomaly VACUUM", + "script": "import psycopg2\nimport sys\nsys.path.append('/root/DB-GPT/')\nimport time\nimport datetime\nimport random\nimport yaml\nfrom multiprocessing.pool import *\n\n\nclass DBArgs(object):\n\n def __init__(self, dbtype, config, dbname=None):\n self.dbtype = dbtype\n if self.dbtype == 'mysql':\n self.host = config['host']\n self.port = config['port']\n self.user = config['user']\n self.password = config['password']\n self.dbname = dbname if dbname else config['dbname']\n self.driver = 'com.mysql.jdbc.Driver'\n self.jdbc = 'jdbc:mysql://'\n else:\n self.host = config['host']\n self.port = config['port']\n self.user = config['user']\n self.password = config['password']\n self.dbname = dbname if dbname else config['dbname']\n self.driver = 'org.postgresql.Driver'\n self.jdbc = 'jdbc:postgresql://'\n\nclass Database():\n def __init__(self, args, timeout=-1):\n self.args = args\n self.conn = self.resetConn(timeout)\n\n\n # self.schema = self.compute_table_schema()\n\n def resetConn(self, timeout=-1):\n conn = psycopg2.connect(database=self.args.dbname,\n user=self.args.user,\n password=self.args.password,\n host=self.args.host,\n port=self.args.port)\n return conn\n \n def execute_sqls(self,sql):\n self.conn =self.resetConn(timeout=-1)\n cur = self.conn.cursor()\n cur.execute(sql)\n self.conn.commit()\n cur.close()\n self.conn.close()\n\n def execute_sql_duration(self, duration, sql, max_id=0, commit_interval=500):\n self.conn = self.resetConn(timeout=-1)\n cursor = self.conn.cursor()\n start = time.time()\n cnt = 0\n if duration > 0:\n while (time.time() - start) < duration:\n if max_id > 0:\n id = random.randint(1, max_id - 1)\n cursor.execute(sql + str(id) + ';')\n else:\n cursor.execute(sql)\n cnt += 1\n if cnt % commit_interval == 0:\n self.conn.commit()\n else:\n print(\"error, the duration should be larger than 0\")\n self.conn.commit()\n cursor.close()\n self.conn.close()\n return cnt\n\n def concurrent_execute_sql(self, threads, duration, sql, max_id=0, commit_interval=500):\n pool = ThreadPool(threads)\n results = [pool.apply_async(self.execute_sql_duration, (duration, sql, max_id, commit_interval)) for _ in range(threads)]\n pool.close()\n pool.join()\n return results\n \ndef init():\n #add the config\n config_path = \"/root/DB-GPT/config/tool_config.yaml\"\n with open(config_path, 'r') as config_file:\n config = yaml.safe_load(config_file) \n db_args =DBArgs('pgsql', config)\n return db_args\n\n\n#create a table\ndef create_table(table_name,colsize, ncolumns):\n db=Database(init())\n column_definitions = ', '.join(f'name{i} varchar({colsize})' for i in range(ncolumns))\n creat_sql = f'CREATE TABLE {table_name} (id int, {column_definitions}, time timestamp);'\n db.execute_sqls(creat_sql)\n\n#delete the table\ndef delete_table(table_name):\n db=Database(init())\n delete_sql=f'DROP TABLE if exists {table_name}'\n db.execute_sqls(delete_sql)\n\n#print the current time\ndef print_time():\n current_time = datetime.datetime.now()\n formatted_time = current_time.strftime(\"%Y-%m-%d %H:%M:%S\")\n 
print(formatted_time)\n\n\ndef vacuum(threads,duration,ncolumns,nrows,colsize,table_name='table1'):\n db=Database(init())\n #create a new table\n print_time()\n delete_table(table_name)\n create_table(table_name,colsize, ncolumns)\n\n # insert some data to be deleted\n insert_definitions = ', '.join(f'(SELECT substr(md5(random()::text), 1, {colsize}))' for i in range(ncolumns))\n insert_data=f'insert into {table_name} select generate_series(1,{nrows}),{insert_definitions}, now();' \n db.execute_sqls(insert_data) \n\n # delete 80% of the rows\n delete_nrows=int(nrows*0.8)\n vacuum=f'delete from {table_name} where id < {delete_nrows};'\n db.execute_sqls(vacuum)\n\n # do the select , then the vacuum occurs\n select='select * from '+table_name+' where id='\n db.concurrent_execute_sql(threads,duration,select,nrows)\n\n #delete the table\n delete_table(table_name)\n print_time()\n\n\nif __name__ == \"__main__\":\n # Number of threads to use for concurrent inserts\n num_threads = 158\n \n # Duration for which to run the inserts (in seconds)\n insert_duration = None\n \n # Number of columns in the table\n num_columns = 17\n \n # Number of rows to insert\n num_rows = 2580822\n \n # Size of each column (in characters)\n column_size = 62\n \n # Table name\n table_name = 'table1'\n \n # Call the insert_large_data function\n vacuum(num_threads, insert_duration, num_columns, num_rows, column_size, table_name)\n", + "description": "In the database system of an online store, there is a system maintenance operation called VACUUM, which is used to clean up and reclaim the fragmented space in the database table. In this scenario, 158 users simultaneously perform searches in a table containing 17 columns and 2,580,822 rows. Each column has a size of 62 characters. The purpose of this simulation is to trigger a database exception caused by the VACUUM operation and the concurrent search operations.\n", + "desc": "In a business intelligence scenario, there is a database used for analyzing the financial data of a large corporation named 'CorporateFinanceDB'. This database contains a key table called 'FinancialRecords', which stores various financial transactions and statement information. The table consists of 2,580,822 rows of data, each representing a financial record, with a total of 17 columns. Each column can store information of up to 62 characters. These columns may include transaction ID, transaction type (such as income, expenditure, assets, liabilities), amount, date, department, project code, budget code, financial year, audit status, and more. In certain situations, such as updating financial statements, clearing obsolete records, or consolidating data, a large-scale data cleanup operation, called VACUUM, needs to be performed in the database. This operation involves deleting a large number of rows, which can significantly impact the performance of the database. If proper optimization techniques, such as incremental deletion, batch processing, or performing the VACUUM operation during low traffic periods, are not implemented, these large-scale deletions could cause anomalies in the database. 
This could lead to delays or failures in other database operations, affecting the overall efficiency of the financial analysis process.\n" + }, + "292": { + "start_time": "1697339376", + "end_time": "1697339491", + "start_timestamp": "2023-10-15 11:09:36", + "end_timestamp": "2023-10-15 11:11:31", + "alerts": [], + "labels": [ + "too many indexes" + ], + "command": "python anomaly_trigger/main.py --anomaly REDUNDANT_INDEX", + "script": "import psycopg2\nimport sys\nsys.path.append('/root/DB-GPT/')\nimport time\nimport datetime\nimport random\nimport yaml\nfrom multiprocessing.pool import *\n\n\nclass DBArgs(object):\n\n def __init__(self, dbtype, config, dbname=None):\n self.dbtype = dbtype\n if self.dbtype == 'mysql':\n self.host = config['host']\n self.port = config['port']\n self.user = config['user']\n self.password = config['password']\n self.dbname = dbname if dbname else config['dbname']\n self.driver = 'com.mysql.jdbc.Driver'\n self.jdbc = 'jdbc:mysql://'\n else:\n self.host = config['host']\n self.port = config['port']\n self.user = config['user']\n self.password = config['password']\n self.dbname = dbname if dbname else config['dbname']\n self.driver = 'org.postgresql.Driver'\n self.jdbc = 'jdbc:postgresql://'\n\nclass Database():\n def __init__(self, args, timeout=-1):\n self.args = args\n self.conn = self.resetConn(timeout)\n\n\n # self.schema = self.compute_table_schema()\n\n def resetConn(self, timeout=-1):\n conn = psycopg2.connect(database=self.args.dbname,\n user=self.args.user,\n password=self.args.password,\n host=self.args.host,\n port=self.args.port)\n return conn\n \n def execute_sqls(self,sql):\n self.conn =self.resetConn(timeout=-1)\n cur = self.conn.cursor()\n cur.execute(sql)\n self.conn.commit()\n cur.close()\n self.conn.close()\n\n def execute_sql_duration(self, duration, sql, max_id=0, commit_interval=500):\n self.conn = self.resetConn(timeout=-1)\n cursor = self.conn.cursor()\n start = time.time()\n cnt = 0\n if duration > 0:\n while (time.time() - start) < duration:\n if max_id > 0:\n id = random.randint(1, max_id - 1)\n cursor.execute(sql + str(id) + ';')\n else:\n cursor.execute(sql)\n cnt += 1\n if cnt % commit_interval == 0:\n self.conn.commit()\n else:\n print(\"error, the duration should be larger than 0\")\n self.conn.commit()\n cursor.close()\n self.conn.close()\n return cnt\n\n def concurrent_execute_sql(self, threads, duration, sql, max_id=0, commit_interval=500):\n pool = ThreadPool(threads)\n results = [pool.apply_async(self.execute_sql_duration, (duration, sql, max_id, commit_interval)) for _ in range(threads)]\n pool.close()\n pool.join()\n return results\n \n def build_index(self, table_name, idx_num):\n self.conn = self.resetConn(timeout=-1)\n cursor = self.conn.cursor()\n \n for i in range(0, idx_num):\n the_sql = 'CREATE INDEX index_' + table_name + '_' + str(i) + ' ON ' + table_name + '(name' + str(i) + ');'\n print(the_sql)\n cursor.execute(the_sql)\n\n \n self.conn.commit()\n self.conn.close()\n return\n\n\n \n def drop_index(self,table_name):\n self.conn = self.resetConn(timeout=-1)\n cursor = self.conn.cursor()\n cursor.execute(\"select indexname from pg_indexes where tablename='\"+table_name+\"';\")\n idxs = cursor.fetchall()\n for idx in idxs:\n the_sql = 'DROP INDEX ' + idx[0] + ';'\n cursor.execute(the_sql)\n print(the_sql)\n self.conn.commit()\n self.conn.close()\n return\n\n\ndef init():\n #add the config\n config_path = \"/root/DB-GPT/config/tool_config.yaml\"\n with open(config_path, 'r') as config_file:\n config = 
yaml.safe_load(config_file) \n db_args =DBArgs('pgsql', config)\n return db_args\n\n\n#create a table\ndef create_table(table_name,colsize, ncolumns):\n db=Database(init())\n column_definitions = ', '.join(f'name{i} varchar({colsize})' for i in range(ncolumns))\n creat_sql = f'CREATE TABLE {table_name} (id int, {column_definitions}, time timestamp);'\n db.execute_sqls(creat_sql)\n\n#delete the table\ndef delete_table(table_name):\n db=Database(init())\n delete_sql=f'DROP TABLE if exists {table_name}'\n db.execute_sqls(delete_sql)\n\n#print the current time\ndef print_time():\n current_time = datetime.datetime.now()\n formatted_time = current_time.strftime(\"%Y-%m-%d %H:%M:%S\")\n print(formatted_time)\n\ndef redundent_index(threads,duration,ncolumns,nrows,colsize,nindex,table_name='table1'):\n #create a new table\n print_time()\n delete_table(table_name)\n create_table(table_name,colsize, ncolumns)\n db=Database(init())\n # insert some data to be updated \n insert_definitions = ', '.join(f'(SELECT substr(md5(random()::text), 1, {colsize}))' for i in range(ncolumns))\n insert_data=f'insert into {table_name} select generate_series(1,{nrows}),{insert_definitions}, now();' \n db.execute_sqls(insert_data) \n\n #initialization of the indexes\n nindex=int((nindex*ncolumns)/10)\n db.build_index(table_name,nindex)\n id_index='CREATE INDEX index_'+table_name+'_id ON '+table_name+'(id);'\n db.execute_sqls(id_index)\n\n #lock_contention\n pool = Pool(threads)\n for _ in range(threads):\n pool.apply_async(\n lock, (table_name, ncolumns, colsize, duration, nrows))\n pool.close()\n pool.join()\n\n #drop the index\n db.drop_index(table_name)\n\n #delete the table\n delete_table(table_name)\n print_time()\n\n\ndef lock(table_name, ncolumns, colsize, duration, nrows):\n args=init()\n start = time.time()\n #lock_contention\n while time.time()-start < duration:\n conn = psycopg2.connect(database=args.dbname, user=args.user, password=args.password,\n host=args.host, port=args.port)\n cur = conn.cursor()\n while time.time()-start < duration:\n col_name = random.randint(0, ncolumns-1)\n row_name = random.randint(1, nrows-1)\n lock_contention = f'update {table_name} set name{col_name}=(SELECT substr(md5(random()::text), 1, {colsize})) where id ={row_name}'\n #db.concurrent_execute_sql(threads,duration,lock_contention,nrows)\n cur.execute(lock_contention)\n conn.commit()\n conn.commit()\n conn.close()\n\nif __name__ == \"__main__\":\n # Number of threads to use for concurrent inserts\n num_threads = 5\n \n # Duration for which to run the inserts (in seconds)\n insert_duration = None\n \n # Number of columns in the table\n num_columns = 81\n \n # Number of rows to insert\n num_rows = 710858\n \n # Size of each column (in characters)\n column_size = 51\n \n # Table name\n table_name = 'table1'\n \n nindex=6\n \n # Call the insert_large_data function\n redundent_index(num_threads, insert_duration, num_columns, num_rows, column_size, nindex,table_name)\n", + "description": "In a database for an online marketplace, there are 5 users executing a query that involves a large database table with 81 columns and 710,858 rows. Each column has a size of 51 characters. However, there are redundant indexes created for certain attributes, such as product name, category, and price range. 
The query performance may be affected due to the additional storage space and processing overhead caused by these redundant indexes.\n", + "desc": "In a business intelligence scenario, particularly within a large corporate environment, there is a database called 'BusinessAnalyticsDB' used for storing and analyzing various business data. This database contains multiple tables, including a key table named 'BusinessRecords', which records important business information. This table contains a large number of rows, specifically 710,858 rows, each representing a separate business record. It also has 81 columns, each capable of storing information up to 51 characters long. These columns might include record ID, business category, sales data, customer profiles, product information, market trends, and more. In a business intelligence analysis process, various complex queries are frequently conducted on the 'BusinessRecords' table to generate valuable business insights. To accelerate these queries, the database administrator may create multiple indexes before running the queries. These indexes could be based on different business attributes such as category, sales data, product information, or market trends. However, if these indexes are not carefully managed and maintained, or if they are not necessary for the specific queries being executed, they can become redundant and impact database performance. Furthermore, the frequent creation, deletion, or modification of indexes can result in additional storage usage, excessive indexing overhead, and potential database fragmentation. In a business intelligence environment, this could lead to decreased performance, slower query processing, and an overall inefficient decision-making process.\n" + }, + "293": { + "start_time": "1697339551", + "end_time": "1697339642", + "start_timestamp": "2023-10-15 11:12:31", + "end_timestamp": "2023-10-15 11:14:02", + "alerts": [], + "labels": [ + "INSERT_LARGE_DATA", + "IO_CONTENTION" + ], + "command": "python anomaly_trigger/main.py --anomaly INSERT_LARGE_DATA,IO_CONTENTION", + "script": "import os\nimport datetime\n\n#print the current time\ndef print_time():\n current_time = datetime.datetime.now()\n formatted_time = current_time.strftime(\"%Y-%m-%d %H:%M:%S\")\n print(formatted_time)\n\nif __name__ == \"__main__\":\n print_time()\n command = (\n \"su - root -c 'cd /sysbench-tpcc-master; \"\n \"./tpcc.lua --db-driver=pgsql --tables=2 --scale=3 --threads=50 --events=0 \"\n \"--pgsql-host=xxxx --pgsql-user=xxxx --pgsql-password=xxxx \"\n \"--pgsql-port=5432 --pgsql-db=tpcc --time=90 --rand-type=uniform --report-interval=10 run'\"\n )\n\n os.system(command)\n print_time()\n", + "description": "In a file sharing system where multiple users are uploading, downloading, or editing files simultaneously, the system is experiencing a contention issue with input/output operations (IO). This contention slows down the file transfer process.\n", + "desc": "In a file transfer system scenario, the database 'TeamFileShareDB' is used for sharing files among teams or organizations. This database not only stores the files themselves but also keeps track of metadata such as uploader information, file size, creation/modification dates, version history, access permissions, and download counts. During a typical workday, multiple users are actively uploading, downloading, and editing files, especially large files such as presentations, video conference recordings, or design drawings. 
The high concurrency of these file operations poses a challenge in terms of I/O (input/output) contention. When multiple users simultaneously upload or download large files, the system's storage and network bandwidth may become significantly strained. This I/O contention can lead to slower file transfer speeds, especially when there is limited bandwidth or insufficient server processing capabilities. Furthermore, frequent write operations in the database, such as file uploads and metadata updates, can impact the overall performance of the database. During peak periods, the database may experience issues with locking and transaction management, further slowing down file processing and metadata recording.\n" + }, + "294": { + "start_time": "1697339702", + "end_time": "1697339762", + "start_timestamp": "2023-10-15 11:15:02", + "end_timestamp": "2023-10-15 11:16:02", + "alerts": [], + "labels": [ + "POOR JOIN PERFORMANCE", + "CPU CONTENTION" + ], + "command": "python anomaly_trigger/main.py --anomaly POOR_JOIN_PERFORMANCE,CPU_CONTENTION", + "script": "import os\nimport re\nimport time\n\nimport psycopg2\n\n\nREPEATCOUNT = 1\nTIMELOGPATH = str(int(time.time())) + \"_job_trigger_time_log.txt\"\nTIMELOG = open(TIMELOGPATH, 'w+')\n\n\nclass Database():\n\n def __init__(self):\n self.conn = None\n self.conn = psycopg2.connect(database='imdbload',\n user='xxxx',\n password='xxxx',\n host='xxxx',\n port=5432)\n\n def execute_sql(self, sql):\n fail = 1\n cur = self.conn.cursor()\n i = 0\n cnt = 3\n while fail == 1 and i < cnt:\n try:\n fail = 0\n cur.execute(sql)\n except BaseException as error:\n fail = 1\n print(error)\n res = []\n if fail == 0:\n res = cur.fetchall()\n i = i + 1\n if fail == 1:\n # print(\"SQL Execution Fatal!!\", sql)\n return 0, ''\n elif fail == 0:\n return 1, res\n\n\ndef all_sql_files():\n res_path = \"{}/join-order-benchmark-master/\".format(\n os.path.dirname(os.path.abspath(__file__)))\n # all_file_list = list(filter(file_filter, os.listdir(res_path)))\n # all_file_list = sorted(all_file_list, key=custom_sort)\n all_file_list = [\n '1a.sql', '1b.sql', '1c.sql', '1d.sql',\n '2a.sql', '2b.sql', '2c.sql', '2d.sql',\n '3a.sql', '3b.sql', '3c.sql',\n '4a.sql', '4b.sql', '4c.sql',\n '5a.sql', '5b.sql', '5c.sql',\n '6a.sql', '6b.sql', '6c.sql', '6d.sql', '6e.sql', '6f.sql',\n '7a.sql', '7b.sql', '7c.sql',\n '8a.sql', '8b.sql', '8c.sql', '8d.sql',\n '9a.sql', '9b.sql', '9c.sql', '9d.sql',\n '10a.sql', '10b.sql', '10c.sql',\n '11a.sql', '11b.sql', '11c.sql', '11d.sql',\n '12a.sql', '12b.sql', '12c.sql',\n '13a.sql', '13b.sql', '13c.sql', '13d.sql',\n '14a.sql', '14b.sql', '14c.sql',\n '15a.sql', '15b.sql', '15c.sql', '15d.sql',\n '16a.sql', '16b.sql', '16c.sql', '16d.sql',\n '17a.sql', '17b.sql', '17c.sql', '17d.sql', '17e.sql', '17f.sql',\n '18a.sql', '18b.sql', '18c.sql',\n '19a.sql', '19b.sql', '19c.sql', '19d.sql',\n '20a.sql', '20b.sql', '20c.sql',\n '21a.sql', '21b.sql', '21c.sql',\n '22a.sql', '22b.sql', '22c.sql', '22d.sql',\n '23a.sql', '23b.sql', '23c.sql',\n '24a.sql', '24b.sql',\n '25a.sql', '25b.sql', '25c.sql',\n '26a.sql', '26b.sql', '26c.sql',\n '27a.sql', '27b.sql', '27c.sql',\n '28a.sql', '28b.sql', '28c.sql',\n '29a.sql', '29b.sql', '29c.sql',\n '30a.sql', '30b.sql', '30c.sql',\n '31a.sql', '31b.sql', '31c.sql',\n '32a.sql', '32b.sql',\n '33a.sql', '33b.sql', '33c.sql']\n\n print(all_file_list)\n files_list = []\n for file in all_file_list:\n files_list.append(res_path + file)\n return files_list\n\n\ndef custom_sort(item):\n # 
\u63d0\u53d6\u6570\u5b57\u548c\u5b57\u6bcd\u90e8\u5206\n match = re.match(r'(\\d+)(\\D+)', item)\n # \u5c06\u6570\u5b57\u90e8\u5206\u8f6c\u6362\u4e3a\u6574\u6570\u4ee5\u8fdb\u884c\u6bd4\u8f83\n num_part = int(match.group(1))\n # \u8fd4\u56de\u5143\u7ec4\u4ee5\u6309\u6570\u5b57\u548c\u5b57\u6bcd\u6392\u5e8f\n return (num_part, match.group(2))\n\n\ndef file_filter(f):\n if f[-4:] == '.sql' and 'schema' not in f and 'fkindexes' not in f:\n return True\n else:\n return False\n\n\ndef get_sql_from_file(file_name):\n file = open(file_name)\n lines = file.readlines().copy()\n sql = ''\n for line in lines:\n sql += line\n sql = sql.replace('\n', ' ').replace(' ', ' ').replace(' ', ' ')\n file.close()\n return sql\n\n\ndef test_hint_from_file(sql_file):\n db = Database()\n sql = get_sql_from_file(sql_file)\n success, result_cont = db.execute_sql(sql)\n print(success, result_cont)\n\n\ndef test_all():\n sql_files = all_sql_files()\n\n for sql_file in list(sql_files)[:-10]:\n if sql_file:\n test_hint_from_file(sql_file)\n\n\ndef test_one():\n res_path = \"{}/join-order-benchmark-master/\".format(\n os.path.dirname(os.path.abspath(__file__)))\n test_hint_from_file(res_path + '1a.sql')\n\n\nif __name__ == '__main__':\n for i in range(0, REPEATCOUNT):\n TIMELOG.write(str(int(time.time()))+\";\")\n test_all()\n TIMELOG.write(str(int(time.time()))+\"\n\")\n TIMELOG.flush()\n\n TIMELOG.close()\n\n)\n", + "description": "In a database system with poor join performance, there is a high contention for CPU resources. This leads to slow query execution and degraded performance when performing join operations.\n", + "desc": "In a business intelligence scenario, there is a database called 'CorporateAnalyticsDB' that stores and analyzes various business data for a large corporation. This database contains multiple complex tables, such as 'SalesData', 'CustomerProfiles', 'ProductCatalog', and 'MarketTrends', each filled with a large number of rows and columns, recording detailed business information. In this scenario, analysts frequently need to perform complex join queries across these tables to generate comprehensive business reports. These join queries involve combining data from multiple tables to gain insights into various aspects of the company's operations. However, due to the size of these tables and the complexity of the join operations, the performance of these queries can be slow. If the join keys are not effectively indexed or if the queries are not properly optimized, executing these join operations can consume a significant amount of time and resources. During peak periods, when multiple complex join queries are executed simultaneously, there may be competition for CPU resources, which can further degrade the query performance. 
This CPU contention can arise due to the presence of many compute-intensive queries running on the database server or if the CPU resources of the server are insufficient to handle the workload.\n" + }, + "295": { + "start_time": "1697339823", + "end_time": "1697339971", + "start_timestamp": "2023-10-15 11:17:03", + "end_timestamp": "2023-10-15 11:19:31", + "alerts": [], + "labels": [ + "FETCH_LARGE_DATA", + "CORRELATED SUBQUERY" + ], + "command": "python anomaly_trigger/main.py --anomaly FETCH_LARGE_DATA,CORRELATED_SUBQUERY", + "script": "import os\nimport re\nimport time\n\nimport psycopg2\n\n\nREPEATCOUNT = 1\nTIMELOGPATH = str(int(time.time())) + \"_tpch_trigger_time_log.txt\"\nTIMELOG = open(TIMELOGPATH, 'w+')\n\n\nclass Database():\n\n def __init__(self):\n self.conn = None\n self.conn = psycopg2.connect(database='tpch',\n user='xxxx',\n password='xxxx',\n host='xxxx',\n port=5432)\n\n def execute_sql(self, sql):\n fail = 1\n cur = self.conn.cursor()\n i = 0\n cnt = 3\n while fail == 1 and i < cnt:\n try:\n fail = 0\n cur.execute(sql)\n except BaseException as error:\n fail = 1\n print(error)\n res = []\n if fail == 0:\n res = cur.fetchall()\n i = i + 1\n if fail == 1:\n # print(\"SQL Execution Fatal!!\", sql)\n return 0, ''\n elif fail == 0:\n return 1, res\n\n\ndef all_sql_files():\n res_path = \"{}/tpch-queries/\".format(\n os.path.dirname(os.path.abspath(__file__)))\n # all_file_list = list(filter(file_filter, os.listdir(res_path)))\n # all_file_list = sorted(all_file_list, key=custom_sort)\n all_file_list = [\n '4.explain.sql']\n\n print(all_file_list)\n files_list = []\n for file in all_file_list:\n files_list.append(res_path + file)\n return files_list\n\n\ndef custom_sort(item):\n # \u63d0\u53d6\u6570\u5b57\u548c\u5b57\u6bcd\u90e8\u5206\n match = re.match(r'(\\d+)(\\D+)', item)\n # \u5c06\u6570\u5b57\u90e8\u5206\u8f6c\u6362\u4e3a\u6574\u6570\u4ee5\u8fdb\u884c\u6bd4\u8f83\n num_part = int(match.group(1))\n # \u8fd4\u56de\u5143\u7ec4\u4ee5\u6309\u6570\u5b57\u548c\u5b57\u6bcd\u6392\u5e8f\n return (num_part, match.group(2))\n\n\ndef file_filter(f):\n if f[-4:] == '.sql' and 'schema' not in f and 'fkindexes' not in f:\n return True\n else:\n return False\n\n\ndef get_sql_from_file(file_name):\n file = open(file_name)\n lines = file.readlines().copy()\n sql = ''\n for line in lines:\n sql += line\n sql = sql.replace('\n', ' ').replace(' ', ' ').replace(' ', ' ')\n file.close()\n return sql\n\n\ndef test_hint_from_file(sql_file):\n db = Database()\n sql = get_sql_from_file(sql_file)\n success, result_cont = db.execute_sql(sql)\n print(success, result_cont)\n\n\ndef test_all():\n sql_files = all_sql_files()\n\n for sql_file in list(sql_files):\n if sql_file:\n test_hint_from_file(sql_file)\n\n\ndef test_one():\n res_path = \"{}/tpch-queries/\".format(\n os.path.dirname(os.path.abspath(__file__)))\n test_hint_from_file(res_path + '1.explain.sql')\n\n\nif __name__ == '__main__':\n for i in range(0, REPEATCOUNT):\n TIMELOG.write(str(int(time.time()))+\";\")\n test_all()\n TIMELOG.write(str(int(time.time()))+\"\n\")\n TIMELOG.flush()\n\n TIMELOG.close()\n", + "description": "In an e-commerce platform's database, a performance issue may arise when querying the inventory of a large number of products due to the execution of related subqueries. 
This can be simulated by running the command \"python anomaly_trigger/main.py --anomaly FETCH_LARGE_DATA,CORRELATED_SUBQUERY\".\n", + "desc": "In the database of an Internet of Things (IoT) scenario, suppose there is a database called 'SmartHomeDB', which is specifically designed for storing and processing data from various connected devices in a smart home environment. This database contains multiple tables, one of which is a key table named 'SensorReadings', recording various sensor data from devices such as temperature sensors, motion sensors, and light sensors. Suppose this table contains a large amount of data, with thousands or even millions of rows, and each row represents a sensor reading at a specific timestamp. The table has multiple columns, including sensor ID, sensor type, reading value, timestamp, device ID, and location information. In this scenario, fetching large amounts of sensor data for analysis or visualization purposes might be a common requirement. For example, an analyst may need to query and retrieve all temperature readings taken in the past month in order to analyze temperature trends within the smart home. The database administrator might also need to perform such queries to generate reports or visualize sensor data for troubleshooting purposes. However, when querying a large amount of sensor data, especially when involving complex conditions or subqueries, the performance of the database can be significantly impacted. If the queries involve correlated subqueries, where the result of one subquery depends on the result of another, the database might need to perform expensive computations or disk I/O operations for each correlated pair of subqueries, which can lead to slow query execution and increased resource consumption.\n" + }, + "296": { + "start_time": "1697340032", + "end_time": "1697340104", + "start_timestamp": "2023-10-15 11:20:32", + "end_timestamp": "2023-10-15 11:21:44", + "alerts": [], + "labels": [ + "highly concurrent commits or highly concurrent inserts" + ], + "command": "python anomaly_trigger/main.py --anomaly INSERT_LARGE_DATA", + "script": "import psycopg2\nimport sys\nsys.path.append('/root/DB-GPT/')\nimport time\nimport datetime\nimport random\nimport yaml\nfrom multiprocessing.pool import *\n\nclass DBArgs(object):\n\n def __init__(self, dbtype, config, dbname=None):\n self.dbtype = dbtype\n if self.dbtype == 'mysql':\n self.host = config['host']\n self.port = config['port']\n self.user = config['user']\n self.password = config['password']\n self.dbname = dbname if dbname else config['dbname']\n self.driver = 'com.mysql.jdbc.Driver'\n self.jdbc = 'jdbc:mysql://'\n else:\n self.host = config['host']\n self.port = config['port']\n self.user = config['user']\n self.password = config['password']\n self.dbname = dbname if dbname else config['dbname']\n self.driver = 'org.postgresql.Driver'\n self.jdbc = 'jdbc:postgresql://'\n\nclass Database():\n def __init__(self, args, timeout=-1):\n self.args = args\n self.conn = self.resetConn(timeout)\n\n\n # self.schema = self.compute_table_schema()\n\n def resetConn(self, timeout=-1):\n conn = psycopg2.connect(database=self.args.dbname,\n user=self.args.user,\n password=self.args.password,\n host=self.args.host,\n port=self.args.port)\n return conn\n\n def execute_sqls(self,sql):\n self.conn =self.resetConn(timeout=-1)\n cur = self.conn.cursor()\n cur.execute(sql)\n self.conn.commit()\n cur.close()\n self.conn.close()\n\n def execute_sql_duration(self, duration, sql, max_id=0, commit_interval=500):\n 
self.conn = self.resetConn(timeout=-1)\n cursor = self.conn.cursor()\n start = time.time()\n cnt = 0\n if duration > 0:\n while (time.time() - start) < duration:\n if max_id > 0:\n id = random.randint(1, max_id - 1)\n cursor.execute(sql + str(id) + ';')\n else:\n cursor.execute(sql)\n cnt += 1\n if cnt % commit_interval == 0:\n self.conn.commit()\n else:\n print(\"error, the duration should be larger than 0\")\n self.conn.commit()\n cursor.close()\n self.conn.close()\n return cnt\n\n def concurrent_execute_sql(self, threads, duration, sql, max_id=0, commit_interval=500):\n pool = ThreadPool(threads)\n results = [pool.apply_async(self.execute_sql_duration, (duration, sql, max_id, commit_interval)) for _ in range(threads)]\n pool.close()\n pool.join()\n return results\n \ndef init():\n #add the config\n config_path = \"/root/DB-GPT/config/tool_config.yaml\"\n with open(config_path, 'r') as config_file:\n config = yaml.safe_load(config_file) \n db_args =DBArgs('pgsql', config)\n return db_args\n\n#create a table\ndef create_table(table_name,colsize, ncolumns):\n db=Database(init())\n column_definitions = ', '.join(f'name{i} varchar({colsize})' for i in range(ncolumns))\n creat_sql = f'CREATE TABLE {table_name} (id int, {column_definitions}, time timestamp);'\n db.execute_sqls(creat_sql)\n\n#delete the table\ndef delete_table(table_name):\n db=Database(init())\n delete_sql=f'DROP TABLE if exists {table_name}'\n db.execute_sqls(delete_sql)\n\n#print the current time\ndef print_time():\n current_time = datetime.datetime.now()\n formatted_time = current_time.strftime(\"%Y-%m-%d %H:%M:%S\")\n print(formatted_time)\n\n\ndef insert_large_data(threads,duration,ncolumns,nrows,colsize,table_name='table1'):\n\n print_time()\n #Delete undeleted tables\n delete_table(table_name)\n #create a new table\n create_table(table_name,colsize, ncolumns)\n db=Database(init())\n #insert the data\n #insert_definitions = ', '.join(f'repeat(round(random()*999)::text,{(colsize//3)})' for i in range(ncolumns))\n insert_definitions = ', '.join(f'(SELECT substr(md5(random()::text), 1, {colsize}))' for i in range(ncolumns))\n insert_data=f'insert into {table_name} select generate_series(1,{nrows}),{insert_definitions}, now();'\n db.concurrent_execute_sql(threads,duration,insert_data,commit_interval=1)\n\n #delete the table\n delete_table(table_name)\n \n #print the end time\n print_time()\nif __name__ == \"__main__\":\n # Number of threads to use for concurrent inserts\n num_threads = 163\n \n # Duration for which to run the inserts (in seconds)\n insert_duration = None\n \n # Number of columns in the table\n num_columns = 8\n \n # Number of rows to insert\n num_rows = 59\n \n # Size of each column (in characters)\n column_size = 48\n \n # Table name\n table_name = 'table1'\n \n # Call the insert_large_data function\n insert_large_data(num_threads, insert_duration, num_columns, num_rows, column_size, table_name)\n", + "description": "In a smart home automation system, 163 devices are simultaneously generating a large amount of data that needs to be inserted into the database. Each device has 8 data fields, with each field containing up to 48 characters. There are a total of 59 records being inserted into the database. This process aims to simulate the database exception caused by the insertion of such large data from multiple devices.\n", + "desc": "In an Internet of Things (IoT) scenario, suppose there is a database named 'SensorDataDB' used for collecting and analyzing sensor data. 
This database is designed to handle a large volume of data from various types of sensors. Within this database, there is a key table called 'SensorReadings' that stores detailed information about sensor readings. This table consists of 59 rows of data, each representing a reading from a sensor, with a total of 8 columns, each containing information of up to 48 characters. These columns may include sensor ID, reading type, reading value, timestamp, sensor location, and status information.In this scenario, there is a need to simulate and test the database's performance when a large amount of sensor data is being inserted. The 'INSERT_LARGE_DATA' anomaly is triggered with the specified parameters, including 163 threads (simulating simultaneous insert operations), 8 columns with 48 characters each, and 59 rows of data. This simulates a situation where a large number of sensors are generating data simultaneously and sending it to the database for storage.The purpose of this test is to evaluate how well the database can handle such a high volume of data insertion and whether it can maintain acceptable performance levels. By triggering this anomaly, it allows for testing the database's ability to handle and process large-scale data insertions efficiently.\n" + }, + "297": { + "start_time": "1697340164", + "end_time": "1697340236", + "start_timestamp": "2023-10-15 11:22:44", + "end_timestamp": "2023-10-15 11:23:56", + "alerts": [], + "labels": [ + "highly concurrent commits or highly concurrent inserts" + ], + "command": "python anomaly_trigger/main.py --anomaly INSERT_LARGE_DATA", + "script": "import psycopg2\nimport sys\nsys.path.append('/root/DB-GPT/')\nimport time\nimport datetime\nimport random\nimport yaml\nfrom multiprocessing.pool import *\n\nclass DBArgs(object):\n\n def __init__(self, dbtype, config, dbname=None):\n self.dbtype = dbtype\n if self.dbtype == 'mysql':\n self.host = config['host']\n self.port = config['port']\n self.user = config['user']\n self.password = config['password']\n self.dbname = dbname if dbname else config['dbname']\n self.driver = 'com.mysql.jdbc.Driver'\n self.jdbc = 'jdbc:mysql://'\n else:\n self.host = config['host']\n self.port = config['port']\n self.user = config['user']\n self.password = config['password']\n self.dbname = dbname if dbname else config['dbname']\n self.driver = 'org.postgresql.Driver'\n self.jdbc = 'jdbc:postgresql://'\n\nclass Database():\n def __init__(self, args, timeout=-1):\n self.args = args\n self.conn = self.resetConn(timeout)\n\n\n # self.schema = self.compute_table_schema()\n\n def resetConn(self, timeout=-1):\n conn = psycopg2.connect(database=self.args.dbname,\n user=self.args.user,\n password=self.args.password,\n host=self.args.host,\n port=self.args.port)\n return conn\n\n def execute_sqls(self,sql):\n self.conn =self.resetConn(timeout=-1)\n cur = self.conn.cursor()\n cur.execute(sql)\n self.conn.commit()\n cur.close()\n self.conn.close()\n\n def execute_sql_duration(self, duration, sql, max_id=0, commit_interval=500):\n self.conn = self.resetConn(timeout=-1)\n cursor = self.conn.cursor()\n start = time.time()\n cnt = 0\n if duration > 0:\n while (time.time() - start) < duration:\n if max_id > 0:\n id = random.randint(1, max_id - 1)\n cursor.execute(sql + str(id) + ';')\n else:\n cursor.execute(sql)\n cnt += 1\n if cnt % commit_interval == 0:\n self.conn.commit()\n else:\n print(\"error, the duration should be larger than 0\")\n self.conn.commit()\n cursor.close()\n self.conn.close()\n return cnt\n\n def 
concurrent_execute_sql(self, threads, duration, sql, max_id=0, commit_interval=500):\n pool = ThreadPool(threads)\n results = [pool.apply_async(self.execute_sql_duration, (duration, sql, max_id, commit_interval)) for _ in range(threads)]\n pool.close()\n pool.join()\n return results\n \ndef init():\n #add the config\n config_path = \"/root/DB-GPT/config/tool_config.yaml\"\n with open(config_path, 'r') as config_file:\n config = yaml.safe_load(config_file) \n db_args =DBArgs('pgsql', config)\n return db_args\n\n#create a table\ndef create_table(table_name,colsize, ncolumns):\n db=Database(init())\n column_definitions = ', '.join(f'name{i} varchar({colsize})' for i in range(ncolumns))\n creat_sql = f'CREATE TABLE {table_name} (id int, {column_definitions}, time timestamp);'\n db.execute_sqls(creat_sql)\n\n#delete the table\ndef delete_table(table_name):\n db=Database(init())\n delete_sql=f'DROP TABLE if exists {table_name}'\n db.execute_sqls(delete_sql)\n\n#print the current time\ndef print_time():\n current_time = datetime.datetime.now()\n formatted_time = current_time.strftime(\"%Y-%m-%d %H:%M:%S\")\n print(formatted_time)\n\n\ndef insert_large_data(threads,duration,ncolumns,nrows,colsize,table_name='table1'):\n\n print_time()\n #Delete undeleted tables\n delete_table(table_name)\n #create a new table\n create_table(table_name,colsize, ncolumns)\n db=Database(init())\n #insert the data\n #insert_definitions = ', '.join(f'repeat(round(random()*999)::text,{(colsize//3)})' for i in range(ncolumns))\n insert_definitions = ', '.join(f'(SELECT substr(md5(random()::text), 1, {colsize}))' for i in range(ncolumns))\n insert_data=f'insert into {table_name} select generate_series(1,{nrows}),{insert_definitions}, now();'\n db.concurrent_execute_sql(threads,duration,insert_data,commit_interval=1)\n\n #delete the table\n delete_table(table_name)\n \n #print the end time\n print_time()\nif __name__ == \"__main__\":\n # Number of threads to use for concurrent inserts\n num_threads = 163\n \n # Duration for which to run the inserts (in seconds)\n insert_duration = None\n \n # Number of columns in the table\n num_columns = 24\n \n # Number of rows to insert\n num_rows = 59\n \n # Size of each column (in characters)\n column_size = 74\n \n # Table name\n table_name = 'table1'\n \n # Call the insert_large_data function\n insert_large_data(num_threads, insert_duration, num_columns, num_rows, column_size, table_name)\n", + "description": "In a smart home system, a large amount of data generated by 163 sensors needs to be inserted into the smart home database simultaneously. Simulate the database exception caused by this process, where the smart home database table contains 24 columns, 59 rows, and each column has a size of 74 characters.\n", + "desc": "In an Internet of Things (IoT) scenario, suppose there is a database called 'IoTDataDB' that collects and stores data from various sensors. This database is designed to handle a large volume of data from these sensors. Within the database, there is a table named 'SensorData', which records information from different sensors. This table consists of 59 rows, each representing data from a particular sensor, with a total of 24 columns. These columns store information such as sensor ID, sensor type, reading value, timestamp, location, and status. In this specific case, the scenario involves inserting a large amount of data into the 'SensorData' table. This is done by simultaneously running 163 threads, where each thread generates data for a specific sensor. 
The data generated includes values for the 24 columns, with each column having a size of 74 characters. The purpose of this scenario could be to simulate a situation where numerous sensors are transmitting data to the database simultaneously, such as in a smart city environment or an industrial monitoring system. However, due to the large volume of data being inserted and the high concurrency of the insertion operation, there might be performance issues in the database. This can be caused by factors such as insufficient buffer size, contention for system resources, or inadequate indexing. These performance issues might result in increased insertion latency, decreased throughput, or even database errors during the insertion process.\n" + }, + "298": { + "start_time": "1697340296", + "end_time": "1697340357", + "start_timestamp": "2023-10-15 11:24:56", + "end_timestamp": "2023-10-15 11:25:57", + "alerts": [], + "labels": [ + "highly concurrent updates" + ], + "command": "python anomaly_trigger/main.py --anomaly LOCK_CONTENTION", + "script": "import psycopg2\nimport sys\nsys.path.append('/root/DB-GPT/')\nimport time\nimport datetime\nimport random\nimport yaml\nfrom multiprocessing.pool import *\n\n\nclass DBArgs(object):\n\n def __init__(self, dbtype, config, dbname=None):\n self.dbtype = dbtype\n if self.dbtype == 'mysql':\n self.host = config['host']\n self.port = config['port']\n self.user = config['user']\n self.password = config['password']\n self.dbname = dbname if dbname else config['dbname']\n self.driver = 'com.mysql.jdbc.Driver'\n self.jdbc = 'jdbc:mysql://'\n else:\n self.host = config['host']\n self.port = config['port']\n self.user = config['user']\n self.password = config['password']\n self.dbname = dbname if dbname else config['dbname']\n self.driver = 'org.postgresql.Driver'\n self.jdbc = 'jdbc:postgresql://'\n\nclass Database():\n def __init__(self, args, timeout=-1):\n self.args = args\n self.conn = self.resetConn(timeout)\n\n\n # self.schema = self.compute_table_schema()\n\n def resetConn(self, timeout=-1):\n conn = psycopg2.connect(database=self.args.dbname,\n user=self.args.user,\n password=self.args.password,\n host=self.args.host,\n port=self.args.port)\n return conn\n \n def execute_sqls(self,sql):\n self.conn =self.resetConn(timeout=-1)\n cur = self.conn.cursor()\n cur.execute(sql)\n self.conn.commit()\n cur.close()\n self.conn.close()\n\n def execute_sql_duration(self, duration, sql, max_id=0, commit_interval=500):\n self.conn = self.resetConn(timeout=-1)\n cursor = self.conn.cursor()\n start = time.time()\n cnt = 0\n if duration > 0:\n while (time.time() - start) < duration:\n if max_id > 0:\n id = random.randint(1, max_id - 1)\n cursor.execute(sql + str(id) + ';')\n else:\n cursor.execute(sql)\n cnt += 1\n if cnt % commit_interval == 0:\n self.conn.commit()\n else:\n print(\"error, the duration should be larger than 0\")\n self.conn.commit()\n cursor.close()\n self.conn.close()\n return cnt\n\n def concurrent_execute_sql(self, threads, duration, sql, max_id=0, commit_interval=500):\n pool = ThreadPool(threads)\n results = [pool.apply_async(self.execute_sql_duration, (duration, sql, max_id, commit_interval)) for _ in range(threads)]\n pool.close()\n pool.join()\n return results\n\ndef init():\n #add the config\n config_path = \"/root/DB-GPT/config/tool_config.yaml\"\n with open(config_path, 'r') as config_file:\n config = yaml.safe_load(config_file) \n db_args =DBArgs('pgsql', config)\n return db_args\n\n\n#create a table\ndef create_table(table_name,colsize, 
ncolumns):\n db=Database(init())\n column_definitions = ', '.join(f'name{i} varchar({colsize})' for i in range(ncolumns))\n creat_sql = f'CREATE TABLE {table_name} (id int, {column_definitions}, time timestamp);'\n db.execute_sqls(creat_sql)\n\n#delete the table\ndef delete_table(table_name):\n db=Database(init())\n delete_sql=f'DROP TABLE if exists {table_name}'\n db.execute_sqls(delete_sql)\n\n#print the current time\ndef print_time():\n current_time = datetime.datetime.now()\n formatted_time = current_time.strftime(\"%Y-%m-%d %H:%M:%S\")\n print(formatted_time)\n\n\ndef lock_contention(threads,duration,ncolumns,nrows,colsize,table_name='table1'):\n #create a new table\n print_time()\n delete_table(table_name)\n create_table(table_name,colsize, ncolumns)\n db=Database(init())\n # insert some data to be updated \n insert_definitions = ', '.join(f'(SELECT substr(md5(random()::text), 1, {colsize}))' for i in range(ncolumns))\n insert_data=f'insert into {table_name} select generate_series(1,{nrows}),{insert_definitions}, now();' \n db.execute_sqls(insert_data) \n pool = Pool(threads)\n for _ in range(threads):\n pool.apply_async(\n lock, (table_name, ncolumns, colsize, duration, nrows))\n pool.close()\n pool.join()\n #delete the table\n delete_table(table_name)\n print_time()\n\ndef lock(table_name, ncolumns, colsize, duration, nrows):\n args=init()\n start = time.time()\n #lock_contention\n while time.time()-start < duration:\n conn = psycopg2.connect(database=args.dbname, user=args.user, password=args.password,\n host=args.host, port=args.port)\n cur = conn.cursor()\n while time.time()-start < duration:\n col_name = random.randint(0, ncolumns-1)\n row_name = random.randint(1, nrows-1)\n lock_contention = f'update {table_name} set name{col_name}=(SELECT substr(md5(random()::text), 1, {colsize})) where id ={row_name}'\n #db.concurrent_execute_sql(threads,duration,lock_contention,nrows)\n cur.execute(lock_contention)\n conn.commit()\n conn.commit()\n conn.close()\n\nif __name__ == \"__main__\":\n # Number of threads to use for concurrent inserts\n num_threads = 177\n \n # Duration for which to run the inserts (in seconds)\n insert_duration = None\n \n # Number of columns in the table\n num_columns = 89\n \n # Number of rows to insert\n num_rows = 292\n \n # Size of each column (in characters)\n column_size = 57\n \n # Table name\n table_name = 'table1'\n \n # Call the insert_large_data function\n lock_contention(num_threads, insert_duration, num_columns, num_rows, column_size, table_name)\n", + "description": "In the database of an online store, 177 users simultaneously attempt to perform a frequent update operation in a database table containing 89 columns and 292 rows of product records each with a column size of 57 characters. Multiple users compete with each other to lock the database table to perform the update operation, which may lead to a database exception.\n", + "desc": "In a banking scenario, there is a database called 'BankingDB' that handles customer and transaction data for a bank. One key table in this database is 'AccountTransactions', which records detailed information about different banking transactions. This table contains 292 rows of data, each representing a transaction record for an account. The table has a total of 89 columns, with each column containing information of up to 57 characters. 
These columns include transaction ID, account number, transaction type, transaction amount, transaction date and time, counterparty account information, transaction status, employee ID, transaction location, currency type, and more. During a high-activity period, 177 users try to update the same or adjacent rows in this table. Due to the database's design and locking mechanism, these concurrent update operations can lead to contention for locking the database table. This contention can result in delayed processing or failure of transaction requests from other users, affecting the bank's daily operations. If such incidents occur frequently, they can cause the database transaction log to grow rapidly, consume excessive storage space, and even temporarily interrupt database services.\n" + }, + "299": { + "start_time": "1697340417", + "end_time": "1697340531", + "start_timestamp": "2023-10-15 11:26:57", + "end_timestamp": "2023-10-15 11:28:51", + "alerts": [], + "labels": [ + "highly deletes" + ], + "command": "python anomaly_trigger/main.py --anomaly VACUUM", + "script": "import psycopg2\nimport sys\nsys.path.append('/root/DB-GPT/')\nimport time\nimport datetime\nimport random\nimport yaml\nfrom multiprocessing.pool import *\n\n\nclass DBArgs(object):\n\n def __init__(self, dbtype, config, dbname=None):\n self.dbtype = dbtype\n if self.dbtype == 'mysql':\n self.host = config['host']\n self.port = config['port']\n self.user = config['user']\n self.password = config['password']\n self.dbname = dbname if dbname else config['dbname']\n self.driver = 'com.mysql.jdbc.Driver'\n self.jdbc = 'jdbc:mysql://'\n else:\n self.host = config['host']\n self.port = config['port']\n self.user = config['user']\n self.password = config['password']\n self.dbname = dbname if dbname else config['dbname']\n self.driver = 'org.postgresql.Driver'\n self.jdbc = 'jdbc:postgresql://'\n\nclass Database():\n def __init__(self, args, timeout=-1):\n self.args = args\n self.conn = self.resetConn(timeout)\n\n\n # self.schema = self.compute_table_schema()\n\n def resetConn(self, timeout=-1):\n conn = psycopg2.connect(database=self.args.dbname,\n user=self.args.user,\n password=self.args.password,\n host=self.args.host,\n port=self.args.port)\n return conn\n \n def execute_sqls(self,sql):\n self.conn =self.resetConn(timeout=-1)\n cur = self.conn.cursor()\n cur.execute(sql)\n self.conn.commit()\n cur.close()\n self.conn.close()\n\n def execute_sql_duration(self, duration, sql, max_id=0, commit_interval=500):\n self.conn = self.resetConn(timeout=-1)\n cursor = self.conn.cursor()\n start = time.time()\n cnt = 0\n if duration > 0:\n while (time.time() - start) < duration:\n if max_id > 0:\n id = random.randint(1, max_id - 1)\n cursor.execute(sql + str(id) + ';')\n else:\n cursor.execute(sql)\n cnt += 1\n if cnt % commit_interval == 0:\n self.conn.commit()\n else:\n print(\"error, the duration should be larger than 0\")\n self.conn.commit()\n cursor.close()\n self.conn.close()\n return cnt\n\n def concurrent_execute_sql(self, threads, duration, sql, max_id=0, commit_interval=500):\n pool = ThreadPool(threads)\n results = [pool.apply_async(self.execute_sql_duration, (duration, sql, max_id, commit_interval)) for _ in range(threads)]\n pool.close()\n pool.join()\n return results\n \ndef init():\n #add the config\n config_path = \"/root/DB-GPT/config/tool_config.yaml\"\n with open(config_path, 'r') as config_file:\n config = yaml.safe_load(config_file) \n db_args =DBArgs('pgsql', config)\n return db_args\n\n\n#create a table\ndef 
create_table(table_name,colsize, ncolumns):\n db=Database(init())\n column_definitions = ', '.join(f'name{i} varchar({colsize})' for i in range(ncolumns))\n creat_sql = f'CREATE TABLE {table_name} (id int, {column_definitions}, time timestamp);'\n db.execute_sqls(creat_sql)\n\n#delete the table\ndef delete_table(table_name):\n db=Database(init())\n delete_sql=f'DROP TABLE if exists {table_name}'\n db.execute_sqls(delete_sql)\n\n#print the current time\ndef print_time():\n current_time = datetime.datetime.now()\n formatted_time = current_time.strftime(\"%Y-%m-%d %H:%M:%S\")\n print(formatted_time)\n\n\ndef vacuum(threads,duration,ncolumns,nrows,colsize,table_name='table1'):\n db=Database(init())\n #create a new table\n print_time()\n delete_table(table_name)\n create_table(table_name,colsize, ncolumns)\n\n # insert some data to be deleted\n insert_definitions = ', '.join(f'(SELECT substr(md5(random()::text), 1, {colsize}))' for i in range(ncolumns))\n insert_data=f'insert into {table_name} select generate_series(1,{nrows}),{insert_definitions}, now();' \n db.execute_sqls(insert_data) \n\n # delete 80% of the rows\n delete_nrows=int(nrows*0.8)\n vacuum=f'delete from {table_name} where id < {delete_nrows};'\n db.execute_sqls(vacuum)\n\n # do the select , then the vacuum occurs\n select='select * from '+table_name+' where id='\n db.concurrent_execute_sql(threads,duration,select,nrows)\n\n #delete the table\n delete_table(table_name)\n print_time()\n\n\nif __name__ == \"__main__\":\n # Number of threads to use for concurrent inserts\n num_threads = 129\n \n # Duration for which to run the inserts (in seconds)\n insert_duration = None\n \n # Number of columns in the table\n num_columns = 8\n \n # Number of rows to insert\n num_rows = 2785449\n \n # Size of each column (in characters)\n column_size = 74\n \n # Table name\n table_name = 'table1'\n \n # Call the insert_large_data function\n vacuum(num_threads, insert_duration, num_columns, num_rows, column_size, table_name)\n", + "description": "In an online store's database, there is a need to perform a large-scale data cleaning operation after which 129 users simultaneously search in a database table containing 8 columns and 2,785,449 rows of records, with each column having a size of 74 characters. This is done to simulate the scenario where, after a data cleaning operation, a large number of users search for products using different criteria such as product name, category, and price range, causing potential performance issues and possible exceptions in the database.\n", + "desc": "In an e-commerce platform, suppose there is a database specifically designed for handling sales data and product information, named 'ECommerceDB'. This database contains various tables, including the 'ProductDetails' table, which stores detailed information about different products. The 'ProductDetails' table contains a large number of rows (approximately 2,785,449) and eight columns, each capable of storing up to 74 characters. These columns may include product ID, name, price, stock quantity, description, brand, category, and image link.However, due to regular updates and changes in the product inventory, it becomes necessary to periodically perform a 'vacuum' operation on the database. This operation involves reclaiming unused space and organizing the data to optimize database performance. 
In this case, the 'vacuum' operation is performed with 129 concurrent threads, which allows for efficient processing of the large-scale data cleanup operation.By performing the 'vacuum' operation, the database can free up space and improve the performance of data retrieval and modification operations. This optimization ensures that the e-commerce platform can efficiently manage and serve large numbers of users and transactions.\n" + }, + "300": { + "start_time": "1697340591", + "end_time": "1697340706", + "start_timestamp": "2023-10-15 11:29:51", + "end_timestamp": "2023-10-15 11:31:46", + "alerts": [], + "labels": [ + "too many indexes" + ], + "command": "python anomaly_trigger/main.py --anomaly REDUNDANT_INDEX", + "script": "import psycopg2\nimport sys\nsys.path.append('/root/DB-GPT/')\nimport time\nimport datetime\nimport random\nimport yaml\nfrom multiprocessing.pool import *\n\n\nclass DBArgs(object):\n\n def __init__(self, dbtype, config, dbname=None):\n self.dbtype = dbtype\n if self.dbtype == 'mysql':\n self.host = config['host']\n self.port = config['port']\n self.user = config['user']\n self.password = config['password']\n self.dbname = dbname if dbname else config['dbname']\n self.driver = 'com.mysql.jdbc.Driver'\n self.jdbc = 'jdbc:mysql://'\n else:\n self.host = config['host']\n self.port = config['port']\n self.user = config['user']\n self.password = config['password']\n self.dbname = dbname if dbname else config['dbname']\n self.driver = 'org.postgresql.Driver'\n self.jdbc = 'jdbc:postgresql://'\n\nclass Database():\n def __init__(self, args, timeout=-1):\n self.args = args\n self.conn = self.resetConn(timeout)\n\n\n # self.schema = self.compute_table_schema()\n\n def resetConn(self, timeout=-1):\n conn = psycopg2.connect(database=self.args.dbname,\n user=self.args.user,\n password=self.args.password,\n host=self.args.host,\n port=self.args.port)\n return conn\n \n def execute_sqls(self,sql):\n self.conn =self.resetConn(timeout=-1)\n cur = self.conn.cursor()\n cur.execute(sql)\n self.conn.commit()\n cur.close()\n self.conn.close()\n\n def execute_sql_duration(self, duration, sql, max_id=0, commit_interval=500):\n self.conn = self.resetConn(timeout=-1)\n cursor = self.conn.cursor()\n start = time.time()\n cnt = 0\n if duration > 0:\n while (time.time() - start) < duration:\n if max_id > 0:\n id = random.randint(1, max_id - 1)\n cursor.execute(sql + str(id) + ';')\n else:\n cursor.execute(sql)\n cnt += 1\n if cnt % commit_interval == 0:\n self.conn.commit()\n else:\n print(\"error, the duration should be larger than 0\")\n self.conn.commit()\n cursor.close()\n self.conn.close()\n return cnt\n\n def concurrent_execute_sql(self, threads, duration, sql, max_id=0, commit_interval=500):\n pool = ThreadPool(threads)\n results = [pool.apply_async(self.execute_sql_duration, (duration, sql, max_id, commit_interval)) for _ in range(threads)]\n pool.close()\n pool.join()\n return results\n \n def build_index(self, table_name, idx_num):\n self.conn = self.resetConn(timeout=-1)\n cursor = self.conn.cursor()\n \n for i in range(0, idx_num):\n the_sql = 'CREATE INDEX index_' + table_name + '_' + str(i) + ' ON ' + table_name + '(name' + str(i) + ');'\n print(the_sql)\n cursor.execute(the_sql)\n\n \n self.conn.commit()\n self.conn.close()\n return\n\n\n \n def drop_index(self,table_name):\n self.conn = self.resetConn(timeout=-1)\n cursor = self.conn.cursor()\n cursor.execute(\"select indexname from pg_indexes where tablename='\"+table_name+\"';\")\n idxs = cursor.fetchall()\n for idx in 
idxs:\n the_sql = 'DROP INDEX ' + idx[0] + ';'\n cursor.execute(the_sql)\n print(the_sql)\n self.conn.commit()\n self.conn.close()\n return\n\n\ndef init():\n #add the config\n config_path = \"/root/DB-GPT/config/tool_config.yaml\"\n with open(config_path, 'r') as config_file:\n config = yaml.safe_load(config_file) \n db_args =DBArgs('pgsql', config)\n return db_args\n\n\n#create a table\ndef create_table(table_name,colsize, ncolumns):\n db=Database(init())\n column_definitions = ', '.join(f'name{i} varchar({colsize})' for i in range(ncolumns))\n creat_sql = f'CREATE TABLE {table_name} (id int, {column_definitions}, time timestamp);'\n db.execute_sqls(creat_sql)\n\n#delete the table\ndef delete_table(table_name):\n db=Database(init())\n delete_sql=f'DROP TABLE if exists {table_name}'\n db.execute_sqls(delete_sql)\n\n#print the current time\ndef print_time():\n current_time = datetime.datetime.now()\n formatted_time = current_time.strftime(\"%Y-%m-%d %H:%M:%S\")\n print(formatted_time)\n\ndef redundent_index(threads,duration,ncolumns,nrows,colsize,nindex,table_name='table1'):\n #create a new table\n print_time()\n delete_table(table_name)\n create_table(table_name,colsize, ncolumns)\n db=Database(init())\n # insert some data to be updated \n insert_definitions = ', '.join(f'(SELECT substr(md5(random()::text), 1, {colsize}))' for i in range(ncolumns))\n insert_data=f'insert into {table_name} select generate_series(1,{nrows}),{insert_definitions}, now();' \n db.execute_sqls(insert_data) \n\n #initialization of the indexes\n nindex=int((nindex*ncolumns)/10)\n db.build_index(table_name,nindex)\n id_index='CREATE INDEX index_'+table_name+'_id ON '+table_name+'(id);'\n db.execute_sqls(id_index)\n\n #lock_contention\n pool = Pool(threads)\n for _ in range(threads):\n pool.apply_async(\n lock, (table_name, ncolumns, colsize, duration, nrows))\n pool.close()\n pool.join()\n\n #drop the index\n db.drop_index(table_name)\n\n #delete the table\n delete_table(table_name)\n print_time()\n\n\ndef lock(table_name, ncolumns, colsize, duration, nrows):\n args=init()\n start = time.time()\n #lock_contention\n while time.time()-start < duration:\n conn = psycopg2.connect(database=args.dbname, user=args.user, password=args.password,\n host=args.host, port=args.port)\n cur = conn.cursor()\n while time.time()-start < duration:\n col_name = random.randint(0, ncolumns-1)\n row_name = random.randint(1, nrows-1)\n lock_contention = f'update {table_name} set name{col_name}=(SELECT substr(md5(random()::text), 1, {colsize})) where id ={row_name}'\n #db.concurrent_execute_sql(threads,duration,lock_contention,nrows)\n cur.execute(lock_contention)\n conn.commit()\n conn.commit()\n conn.close()\n\nif __name__ == \"__main__\":\n # Number of threads to use for concurrent inserts\n num_threads = 7\n \n # Duration for which to run the inserts (in seconds)\n insert_duration = None\n \n # Number of columns in the table\n num_columns = 58\n \n # Number of rows to insert\n num_rows = 614396\n \n # Size of each column (in characters)\n column_size = 63\n \n # Table name\n table_name = 'table1'\n \n nindex=6\n \n # Call the insert_large_data function\n redundent_index(num_threads, insert_duration, num_columns, num_rows, column_size, nindex,table_name)\n", + "description": "In an e-commerce database with 58 columns and 614,396 rows, each with a column size of 63 characters, a large number of indexes are created for items such as product name, category, and price range at the beginning of the query. 
Simulate the additional storage footprint and performance overhead caused by this process, with 7 users executing the query simultaneously.\n", + "desc": "In an IoT scenario, suppose there is a database named 'SensorDataDB', which is used to collect and analyze sensor data from various sensors. This database contains a table named 'SensorReadings', which stores data from a large number of sensors. The table consists of 614,396 rows of data, each row representing a reading from a sensor, with a total of 58 columns, each containing information of up to 63 characters. These columns may include sensor ID, sensor type, reading value, timestamp, location, and other related sensor data. In this scenario, the administrator of the database might have created multiple indexes to improve query performance. These indexes could be based on commonly queried columns such as sensor type, location, or timestamp. However, if the number of indexes created is excessive or redundant, it could lead to performance issues in the database. Redundant indexes can increase storage usage and require additional maintenance overhead. Moreover, frequent index operations can cause database fragmentation, impacting overall performance. In an IoT environment where sensor data is constantly being collected and analyzed, this can result in delayed data processing and analysis, affecting the efficiency of the entire system.\n" + }, + "301": { + "start_time": "1697340766", + "end_time": "1697340857", + "start_timestamp": "2023-10-15 11:32:46", + "end_timestamp": "2023-10-15 11:34:17", + "alerts": [], + "labels": [ + "INSERT_LARGE_DATA", + "IO_CONTENTION" + ], + "command": "python anomaly_trigger/main.py --anomaly INSERT_LARGE_DATA,IO_CONTENTION", + "script": "import os\nimport datetime\n\n#print the current time\ndef print_time():\n current_time = datetime.datetime.now()\n formatted_time = current_time.strftime(\"%Y-%m-%d %H:%M:%S\")\n print(formatted_time)\n\nif __name__ == \"__main__\":\n print_time()\n command = (\n \"su - root -c 'cd /sysbench-tpcc-master; \"\n \"./tpcc.lua --db-driver=pgsql --tables=2 --scale=3 --threads=50 --events=0 \"\n \"--pgsql-host=xxxx --pgsql-user=xxxx --pgsql-password=xxxx \"\n \"--pgsql-port=5432 --pgsql-db=tpcc --time=90 --rand-type=uniform --report-interval=10 run'\"\n )\n\n os.system(command)\n print_time()\n", + "description": "In a file sharing system where multiple users are sharing files, there is a scenario where multiple users upload, download, or edit files simultaneously. This leads to I/O contention, which causes a slowdown in file transfer. This scenario is simulated using the command \"python anomaly_trigger/main.py --anomaly INSERT_LARGE_DATA,IO_CONTENTION\".\n", + "desc": "In a file sharing system, there is a database called 'FileShareDB' that is used for storing and managing files shared among users. This database contains important information about the files, such as file size, upload date, permission settings, and user information. There are multiple users who are constantly uploading, downloading, and modifying files in this system. For example, a team is working on a project and members frequently upload updated versions of files, while others download them for review or editing. Additionally, there may be large files, such as presentations or design drawings, being stored and shared through this system. Due to the high level of concurrent file operations, the database experiences input/output (I/O) contention. 
This means that when multiple users are simultaneously uploading or downloading large files, the system's storage and network bandwidth may become strained. This contention can result in slower file transfer speeds, especially when the system has limited bandwidth or insufficient server processing capabilities. Furthermore, the frequent write operations in the database, such as file uploads and updates to the file metadata, can impact the overall performance of the database. During peak usage periods, the database may experience issues with locking and transaction management, causing further delays in file processing and the recording of metadata.\n" + }, + "302": { + "start_time": "1697340917", + "end_time": "1697340977", + "start_timestamp": "2023-10-15 11:35:17", + "end_timestamp": "2023-10-15 11:36:17", + "alerts": [], + "labels": [ + "POOR JOIN PERFORMANCE", + "CPU CONTENTION" + ], + "command": "python anomaly_trigger/main.py --anomaly POOR_JOIN_PERFORMANCE,CPU_CONTENTION", + "script": "import os\nimport re\nimport time\n\nimport psycopg2\n\n\nREPEATCOUNT = 1\nTIMELOGPATH = str(int(time.time())) + \"_job_trigger_time_log.txt\"\nTIMELOG = open(TIMELOGPATH, 'w+')\n\n\nclass Database():\n\n def __init__(self):\n self.conn = None\n self.conn = psycopg2.connect(database='imdbload',\n user='xxxx',\n password='xxxx',\n host='xxxx',\n port=5432)\n\n def execute_sql(self, sql):\n fail = 1\n cur = self.conn.cursor()\n i = 0\n cnt = 3\n while fail == 1 and i < cnt:\n try:\n fail = 0\n cur.execute(sql)\n except BaseException as error:\n fail = 1\n print(error)\n res = []\n if fail == 0:\n res = cur.fetchall()\n i = i + 1\n if fail == 1:\n # print(\"SQL Execution Fatal!!\", sql)\n return 0, ''\n elif fail == 0:\n return 1, res\n\n\ndef all_sql_files():\n res_path = \"{}/join-order-benchmark-master/\".format(\n os.path.dirname(os.path.abspath(__file__)))\n # all_file_list = list(filter(file_filter, os.listdir(res_path)))\n # all_file_list = sorted(all_file_list, key=custom_sort)\n all_file_list = [\n '1a.sql', '1b.sql', '1c.sql', '1d.sql',\n '2a.sql', '2b.sql', '2c.sql', '2d.sql',\n '3a.sql', '3b.sql', '3c.sql',\n '4a.sql', '4b.sql', '4c.sql',\n '5a.sql', '5b.sql', '5c.sql',\n '6a.sql', '6b.sql', '6c.sql', '6d.sql', '6e.sql', '6f.sql',\n '7a.sql', '7b.sql', '7c.sql',\n '8a.sql', '8b.sql', '8c.sql', '8d.sql',\n '9a.sql', '9b.sql', '9c.sql', '9d.sql',\n '10a.sql', '10b.sql', '10c.sql',\n '11a.sql', '11b.sql', '11c.sql', '11d.sql',\n '12a.sql', '12b.sql', '12c.sql',\n '13a.sql', '13b.sql', '13c.sql', '13d.sql',\n '14a.sql', '14b.sql', '14c.sql',\n '15a.sql', '15b.sql', '15c.sql', '15d.sql',\n '16a.sql', '16b.sql', '16c.sql', '16d.sql',\n '17a.sql', '17b.sql', '17c.sql', '17d.sql', '17e.sql', '17f.sql',\n '18a.sql', '18b.sql', '18c.sql',\n '19a.sql', '19b.sql', '19c.sql', '19d.sql',\n '20a.sql', '20b.sql', '20c.sql',\n '21a.sql', '21b.sql', '21c.sql',\n '22a.sql', '22b.sql', '22c.sql', '22d.sql',\n '23a.sql', '23b.sql', '23c.sql',\n '24a.sql', '24b.sql',\n '25a.sql', '25b.sql', '25c.sql',\n '26a.sql', '26b.sql', '26c.sql',\n '27a.sql', '27b.sql', '27c.sql',\n '28a.sql', '28b.sql', '28c.sql',\n '29a.sql', '29b.sql', '29c.sql',\n '30a.sql', '30b.sql', '30c.sql',\n '31a.sql', '31b.sql', '31c.sql',\n '32a.sql', '32b.sql',\n '33a.sql', '33b.sql', '33c.sql']\n\n print(all_file_list)\n files_list = []\n for file in all_file_list:\n files_list.append(res_path + file)\n return files_list\n\n\ndef custom_sort(item):\n # \u63d0\u53d6\u6570\u5b57\u548c\u5b57\u6bcd\u90e8\u5206\n match = re.match(r'(\\d+)(\\D+)', 
item)\n # \u5c06\u6570\u5b57\u90e8\u5206\u8f6c\u6362\u4e3a\u6574\u6570\u4ee5\u8fdb\u884c\u6bd4\u8f83\n num_part = int(match.group(1))\n # \u8fd4\u56de\u5143\u7ec4\u4ee5\u6309\u6570\u5b57\u548c\u5b57\u6bcd\u6392\u5e8f\n return (num_part, match.group(2))\n\n\ndef file_filter(f):\n if f[-4:] == '.sql' and 'schema' not in f and 'fkindexes' not in f:\n return True\n else:\n return False\n\n\ndef get_sql_from_file(file_name):\n file = open(file_name)\n lines = file.readlines().copy()\n sql = ''\n for line in lines:\n sql += line\n sql = sql.replace('\n', ' ').replace(' ', ' ').replace(' ', ' ')\n file.close()\n return sql\n\n\ndef test_hint_from_file(sql_file):\n db = Database()\n sql = get_sql_from_file(sql_file)\n success, result_cont = db.execute_sql(sql)\n print(success, result_cont)\n\n\ndef test_all():\n sql_files = all_sql_files()\n\n for sql_file in list(sql_files)[:-10]:\n if sql_file:\n test_hint_from_file(sql_file)\n\n\ndef test_one():\n res_path = \"{}/join-order-benchmark-master/\".format(\n os.path.dirname(os.path.abspath(__file__)))\n test_hint_from_file(res_path + '1a.sql')\n\n\nif __name__ == '__main__':\n for i in range(0, REPEATCOUNT):\n TIMELOG.write(str(int(time.time()))+\";\")\n test_all()\n TIMELOG.write(str(int(time.time()))+\"\n\")\n TIMELOG.flush()\n\n TIMELOG.close()\n\n)\n", + "description": "In a data analysis scenario for a company, when joining large datasets with poor performance optimization, multiple CPU cores compete for computing resources, resulting in slower execution times for the join operation.\n", + "desc": "In a business intelligence scenario, specifically in the database of a large corporation, there is a database named 'CorporateAnalyticsDB' that contains various complex tables such as 'SalesData', 'CustomerProfiles', 'ProductCatalog', and 'MarketTrends'. These tables hold detailed business information, such as sales records, customer information, product data, and market trends. The company's analysts frequently need to perform complex join queries across multiple tables to generate comprehensive business reports. For example, they may need to join the 'SalesData' table with the 'CustomerProfiles' table to analyze customer purchasing behaviors, as well as link these data with the 'ProductCatalog' and 'MarketTrends' tables to gain deeper market insights. However, due to the large size of these tables and the complexity of the join operations, the performance of these queries can become slow. This can be exacerbated if the join keys are not effectively indexed or if the queries are not optimized. Additionally, during peak periods when multiple complex join queries are running simultaneously, there may be competition for CPU resources, which can further impact query efficiency. 
This CPU contention can occur if there are too many compute-intensive queries running on the database server or if the server's CPU resources are insufficient to handle the workload.\n" + }, + "303": { + "start_time": "1697341037", + "end_time": "1697341177", + "start_timestamp": "2023-10-15 11:37:17", + "end_timestamp": "2023-10-15 11:39:37", + "alerts": [], + "labels": [ + "FETCH_LARGE_DATA", + "CORRELATED SUBQUERY" + ], + "command": "python anomaly_trigger/main.py --anomaly FETCH_LARGE_DATA,CORRELATED_SUBQUERY", + "script": "import os\nimport re\nimport time\n\nimport psycopg2\n\n\nREPEATCOUNT = 1\nTIMELOGPATH = str(int(time.time())) + \"_tpch_trigger_time_log.txt\"\nTIMELOG = open(TIMELOGPATH, 'w+')\n\n\nclass Database():\n\n def __init__(self):\n self.conn = None\n self.conn = psycopg2.connect(database='tpch',\n user='xxxx',\n password='xxxx',\n host='xxxx',\n port=5432)\n\n def execute_sql(self, sql):\n fail = 1\n cur = self.conn.cursor()\n i = 0\n cnt = 3\n while fail == 1 and i < cnt:\n try:\n fail = 0\n cur.execute(sql)\n except BaseException as error:\n fail = 1\n print(error)\n res = []\n if fail == 0:\n res = cur.fetchall()\n i = i + 1\n if fail == 1:\n # print(\"SQL Execution Fatal!!\", sql)\n return 0, ''\n elif fail == 0:\n return 1, res\n\n\ndef all_sql_files():\n res_path = \"{}/tpch-queries/\".format(\n os.path.dirname(os.path.abspath(__file__)))\n # all_file_list = list(filter(file_filter, os.listdir(res_path)))\n # all_file_list = sorted(all_file_list, key=custom_sort)\n all_file_list = [\n '4.explain.sql']\n\n print(all_file_list)\n files_list = []\n for file in all_file_list:\n files_list.append(res_path + file)\n return files_list\n\n\ndef custom_sort(item):\n # \u63d0\u53d6\u6570\u5b57\u548c\u5b57\u6bcd\u90e8\u5206\n match = re.match(r'(\\d+)(\\D+)', item)\n # \u5c06\u6570\u5b57\u90e8\u5206\u8f6c\u6362\u4e3a\u6574\u6570\u4ee5\u8fdb\u884c\u6bd4\u8f83\n num_part = int(match.group(1))\n # \u8fd4\u56de\u5143\u7ec4\u4ee5\u6309\u6570\u5b57\u548c\u5b57\u6bcd\u6392\u5e8f\n return (num_part, match.group(2))\n\n\ndef file_filter(f):\n if f[-4:] == '.sql' and 'schema' not in f and 'fkindexes' not in f:\n return True\n else:\n return False\n\n\ndef get_sql_from_file(file_name):\n file = open(file_name)\n lines = file.readlines().copy()\n sql = ''\n for line in lines:\n sql += line\n sql = sql.replace('\n', ' ').replace(' ', ' ').replace(' ', ' ')\n file.close()\n return sql\n\n\ndef test_hint_from_file(sql_file):\n db = Database()\n sql = get_sql_from_file(sql_file)\n success, result_cont = db.execute_sql(sql)\n print(success, result_cont)\n\n\ndef test_all():\n sql_files = all_sql_files()\n\n for sql_file in list(sql_files):\n if sql_file:\n test_hint_from_file(sql_file)\n\n\ndef test_one():\n res_path = \"{}/tpch-queries/\".format(\n os.path.dirname(os.path.abspath(__file__)))\n test_hint_from_file(res_path + '1.explain.sql')\n\n\nif __name__ == '__main__':\n for i in range(0, REPEATCOUNT):\n TIMELOG.write(str(int(time.time()))+\";\")\n test_all()\n TIMELOG.write(str(int(time.time()))+\"\n\")\n TIMELOG.flush()\n\n TIMELOG.close()\n", + "description": "In a real-life scenario, imagine an e-commerce platform that needs to track the inventory levels for each product. However, the process of retrieving this information requires executing related subqueries. This can be quite time-consuming, especially if there are a large number of products in the database. 
Without optimizing these related subqueries, the performance of the inventory query operation may significantly deteriorate.\n", + "desc": "In the database of an e-commerce platform, there is a database named 'ECommerceDB', which includes a crucial table named 'ProductInventory' for recording the inventory information of various products. This table might contain inventory data for tens of thousands or even hundreds of thousands of products. The inventory information for each product includes the product ID, current stock level, last inventory update time, supplier ID, warehouse location, and other details. In this database, querying the inventory level of each product may require performing related subqueries. For example, a common query might be to determine the total current inventory of all products within a specific category. This type of query might first involve selecting all products of a particular category from the 'ProductDetails' table, then conducting subqueries on the 'ProductInventory' table to obtain the inventory data for these products. When the number of products is very large, the performance of these related subqueries can become inefficient. For instance, if a category includes thousands of products, executing individual subqueries for each product to obtain inventory information would be time-consuming. In such cases, due to the need to retrieve inventory information for a large number of products, the database might need to read a significant amount of data from the disk, which could lead to I/O bottlenecks.\n" + }, + "304": { + "start_time": "1697341238", + "end_time": "1697341309", + "start_timestamp": "2023-10-15 11:40:38", + "end_timestamp": "2023-10-15 11:41:49", + "alerts": [], + "labels": [ + "highly concurrent commits or highly concurrent inserts" + ], + "command": "python anomaly_trigger/main.py --anomaly INSERT_LARGE_DATA", + "script": "import psycopg2\nimport sys\nsys.path.append('/root/DB-GPT/')\nimport time\nimport datetime\nimport random\nimport yaml\nfrom multiprocessing.pool import *\n\nclass DBArgs(object):\n\n def __init__(self, dbtype, config, dbname=None):\n self.dbtype = dbtype\n if self.dbtype == 'mysql':\n self.host = config['host']\n self.port = config['port']\n self.user = config['user']\n self.password = config['password']\n self.dbname = dbname if dbname else config['dbname']\n self.driver = 'com.mysql.jdbc.Driver'\n self.jdbc = 'jdbc:mysql://'\n else:\n self.host = config['host']\n self.port = config['port']\n self.user = config['user']\n self.password = config['password']\n self.dbname = dbname if dbname else config['dbname']\n self.driver = 'org.postgresql.Driver'\n self.jdbc = 'jdbc:postgresql://'\n\nclass Database():\n def __init__(self, args, timeout=-1):\n self.args = args\n self.conn = self.resetConn(timeout)\n\n\n # self.schema = self.compute_table_schema()\n\n def resetConn(self, timeout=-1):\n conn = psycopg2.connect(database=self.args.dbname,\n user=self.args.user,\n password=self.args.password,\n host=self.args.host,\n port=self.args.port)\n return conn\n\n def execute_sqls(self,sql):\n self.conn =self.resetConn(timeout=-1)\n cur = self.conn.cursor()\n cur.execute(sql)\n self.conn.commit()\n cur.close()\n self.conn.close()\n\n def execute_sql_duration(self, duration, sql, max_id=0, commit_interval=500):\n self.conn = self.resetConn(timeout=-1)\n cursor = self.conn.cursor()\n start = time.time()\n cnt = 0\n if duration > 0:\n while (time.time() - start) < duration:\n if max_id > 0:\n id = random.randint(1, max_id - 1)\n 
cursor.execute(sql + str(id) + ';')\n else:\n cursor.execute(sql)\n cnt += 1\n if cnt % commit_interval == 0:\n self.conn.commit()\n else:\n print(\"error, the duration should be larger than 0\")\n self.conn.commit()\n cursor.close()\n self.conn.close()\n return cnt\n\n def concurrent_execute_sql(self, threads, duration, sql, max_id=0, commit_interval=500):\n pool = ThreadPool(threads)\n results = [pool.apply_async(self.execute_sql_duration, (duration, sql, max_id, commit_interval)) for _ in range(threads)]\n pool.close()\n pool.join()\n return results\n \ndef init():\n #add the config\n config_path = \"/root/DB-GPT/config/tool_config.yaml\"\n with open(config_path, 'r') as config_file:\n config = yaml.safe_load(config_file) \n db_args =DBArgs('pgsql', config)\n return db_args\n\n#create a table\ndef create_table(table_name,colsize, ncolumns):\n db=Database(init())\n column_definitions = ', '.join(f'name{i} varchar({colsize})' for i in range(ncolumns))\n creat_sql = f'CREATE TABLE {table_name} (id int, {column_definitions}, time timestamp);'\n db.execute_sqls(creat_sql)\n\n#delete the table\ndef delete_table(table_name):\n db=Database(init())\n delete_sql=f'DROP TABLE if exists {table_name}'\n db.execute_sqls(delete_sql)\n\n#print the current time\ndef print_time():\n current_time = datetime.datetime.now()\n formatted_time = current_time.strftime(\"%Y-%m-%d %H:%M:%S\")\n print(formatted_time)\n\n\ndef insert_large_data(threads,duration,ncolumns,nrows,colsize,table_name='table1'):\n\n print_time()\n #Delete undeleted tables\n delete_table(table_name)\n #create a new table\n create_table(table_name,colsize, ncolumns)\n db=Database(init())\n #insert the data\n #insert_definitions = ', '.join(f'repeat(round(random()*999)::text,{(colsize//3)})' for i in range(ncolumns))\n insert_definitions = ', '.join(f'(SELECT substr(md5(random()::text), 1, {colsize}))' for i in range(ncolumns))\n insert_data=f'insert into {table_name} select generate_series(1,{nrows}),{insert_definitions}, now();'\n db.concurrent_execute_sql(threads,duration,insert_data,commit_interval=1)\n\n #delete the table\n delete_table(table_name)\n \n #print the end time\n print_time()\nif __name__ == \"__main__\":\n # Number of threads to use for concurrent inserts\n num_threads = 144\n \n # Duration for which to run the inserts (in seconds)\n insert_duration = None\n \n # Number of columns in the table\n num_columns = 10\n \n # Number of rows to insert\n num_rows = 90\n \n # Size of each column (in characters)\n column_size = 26\n \n # Table name\n table_name = 'table1'\n \n # Call the insert_large_data function\n insert_large_data(num_threads, insert_duration, num_columns, num_rows, column_size, table_name)\n", + "description": "In a system where 144 devices are simultaneously inserting a large amount of data, each device is inserting 90 rows with 10 columns, where each column has a size of 26 characters. This scenario simulates the database exception caused by the high data insertion rate.\n", + "desc": "In an Internet of Things (IoT) scenario, let's consider a database named 'SensorDataDB' that is designed to store sensor data collected from various devices in a smart home. This database consists of multiple tables, one of which is named 'SensorReadings' and it records data from different sensors. The 'SensorReadings' table contains information such as sensor ID, sensor type, reading value, timestamp, and sensor location. 
In this particular scenario, there are 90 rows of data in the table, with each row representing a reading from a sensor. The table has 10 columns, each capable of storing up to 26 characters of data. Now, suppose at a specific moment, there is a surge in sensor data being generated, with multiple sensors simultaneously transmitting data to the database. This generates a large number of write requests to insert data into the 'SensorReadings' table. The high concurrency of these insert operations can potentially lead to performance issues, especially if the database is not optimized to handle such a large volume of data. Inefficient indexing, lack of proper buffering mechanisms, or improper partitioning can contribute to slower write speeds and increased latency. This can result in anomalies in the database, such as delayed or failed insertions, which may impact the real-time monitoring or automation systems relying on this data.\n" + }, + "305": { + "start_time": "1697341369", + "end_time": "1697341441", + "start_timestamp": "2023-10-15 11:42:49", + "end_timestamp": "2023-10-15 11:44:01", + "alerts": [], + "labels": [ + "highly concurrent commits or highly concurrent inserts" + ], + "command": "python anomaly_trigger/main.py --anomaly INSERT_LARGE_DATA", + "script": "import psycopg2\nimport sys\nsys.path.append('/root/DB-GPT/')\nimport time\nimport datetime\nimport random\nimport yaml\nfrom multiprocessing.pool import *\n\nclass DBArgs(object):\n\n def __init__(self, dbtype, config, dbname=None):\n self.dbtype = dbtype\n if self.dbtype == 'mysql':\n self.host = config['host']\n self.port = config['port']\n self.user = config['user']\n self.password = config['password']\n self.dbname = dbname if dbname else config['dbname']\n self.driver = 'com.mysql.jdbc.Driver'\n self.jdbc = 'jdbc:mysql://'\n else:\n self.host = config['host']\n self.port = config['port']\n self.user = config['user']\n self.password = config['password']\n self.dbname = dbname if dbname else config['dbname']\n self.driver = 'org.postgresql.Driver'\n self.jdbc = 'jdbc:postgresql://'\n\nclass Database():\n def __init__(self, args, timeout=-1):\n self.args = args\n self.conn = self.resetConn(timeout)\n\n\n # self.schema = self.compute_table_schema()\n\n def resetConn(self, timeout=-1):\n conn = psycopg2.connect(database=self.args.dbname,\n user=self.args.user,\n password=self.args.password,\n host=self.args.host,\n port=self.args.port)\n return conn\n\n def execute_sqls(self,sql):\n self.conn =self.resetConn(timeout=-1)\n cur = self.conn.cursor()\n cur.execute(sql)\n self.conn.commit()\n cur.close()\n self.conn.close()\n\n def execute_sql_duration(self, duration, sql, max_id=0, commit_interval=500):\n self.conn = self.resetConn(timeout=-1)\n cursor = self.conn.cursor()\n start = time.time()\n cnt = 0\n if duration > 0:\n while (time.time() - start) < duration:\n if max_id > 0:\n id = random.randint(1, max_id - 1)\n cursor.execute(sql + str(id) + ';')\n else:\n cursor.execute(sql)\n cnt += 1\n if cnt % commit_interval == 0:\n self.conn.commit()\n else:\n print(\"error, the duration should be larger than 0\")\n self.conn.commit()\n cursor.close()\n self.conn.close()\n return cnt\n\n def concurrent_execute_sql(self, threads, duration, sql, max_id=0, commit_interval=500):\n pool = ThreadPool(threads)\n results = [pool.apply_async(self.execute_sql_duration, (duration, sql, max_id, commit_interval)) for _ in range(threads)]\n pool.close()\n pool.join()\n return results\n \ndef init():\n #add the config\n config_path = 
\"/root/DB-GPT/config/tool_config.yaml\"\n with open(config_path, 'r') as config_file:\n config = yaml.safe_load(config_file) \n db_args =DBArgs('pgsql', config)\n return db_args\n\n#create a table\ndef create_table(table_name,colsize, ncolumns):\n db=Database(init())\n column_definitions = ', '.join(f'name{i} varchar({colsize})' for i in range(ncolumns))\n creat_sql = f'CREATE TABLE {table_name} (id int, {column_definitions}, time timestamp);'\n db.execute_sqls(creat_sql)\n\n#delete the table\ndef delete_table(table_name):\n db=Database(init())\n delete_sql=f'DROP TABLE if exists {table_name}'\n db.execute_sqls(delete_sql)\n\n#print the current time\ndef print_time():\n current_time = datetime.datetime.now()\n formatted_time = current_time.strftime(\"%Y-%m-%d %H:%M:%S\")\n print(formatted_time)\n\n\ndef insert_large_data(threads,duration,ncolumns,nrows,colsize,table_name='table1'):\n\n print_time()\n #Delete undeleted tables\n delete_table(table_name)\n #create a new table\n create_table(table_name,colsize, ncolumns)\n db=Database(init())\n #insert the data\n #insert_definitions = ', '.join(f'repeat(round(random()*999)::text,{(colsize//3)})' for i in range(ncolumns))\n insert_definitions = ', '.join(f'(SELECT substr(md5(random()::text), 1, {colsize}))' for i in range(ncolumns))\n insert_data=f'insert into {table_name} select generate_series(1,{nrows}),{insert_definitions}, now();'\n db.concurrent_execute_sql(threads,duration,insert_data,commit_interval=1)\n\n #delete the table\n delete_table(table_name)\n \n #print the end time\n print_time()\nif __name__ == \"__main__\":\n # Number of threads to use for concurrent inserts\n num_threads = 144\n \n # Duration for which to run the inserts (in seconds)\n insert_duration = None\n \n # Number of columns in the table\n num_columns = 39\n \n # Number of rows to insert\n num_rows = 65\n \n # Size of each column (in characters)\n column_size = 85\n \n # Table name\n table_name = 'table1'\n \n # Call the insert_large_data function\n insert_large_data(num_threads, insert_duration, num_columns, num_rows, column_size, table_name)\n", + "description": "In a financial system, 144 financial transactions are being inserted into a database table simultaneously. Each transaction has 39 columns with a size of 85 characters, and there are a total of 65 transactions. This process is simulating the database exception that can occur when inserting a large amount of data.\n", + "desc": "In an Internet of Things (IoT) scenario, let's consider a database called 'IoTDataDB' that stores sensor data collected from various IoT devices. This database is designed to handle a large volume of data from different types of sensors. One of the key tables in this database is 'SensorReadings', which contains information about the readings from these sensors. This table consists of 65 rows, each representing a reading from a specific sensor, with a total of 39 columns, each containing information of up to 85 characters. These columns may include sensor ID, sensor type (such as temperature, humidity, pressure, etc.), reading value, timestamp, location, and other related attributes. In this specific scenario, the database is facing an issue during the insertion of a large amount of data. At a given time, 144 IoT devices are simultaneously transmitting data to the 'SensorReadings' table, which requires inserting thousands of rows. 
However, due to various factors such as inefficient data buffering, lack of suitable indexing, or inadequate database parameters, the database's ability to handle these concurrent insertions is limited. As a result, the database might face performance problems, including increased write latency and potential data inconsistencies.These anomalies can impact the overall efficiency and reliability of the IoT system, potentially leading to delayed or lost data, inaccurate analytics, or even system failures. It is essential to optimize the database configuration, data insertion process, and indexing strategies to ensure smooth and efficient data processing within the IoT environment.\n" + }, + "306": { + "start_time": "1697341501", + "end_time": "1697341562", + "start_timestamp": "2023-10-15 11:45:01", + "end_timestamp": "2023-10-15 11:46:02", + "alerts": [], + "labels": [ + "highly concurrent updates" + ], + "command": "python anomaly_trigger/main.py --anomaly LOCK_CONTENTION", + "script": "import psycopg2\nimport sys\nsys.path.append('/root/DB-GPT/')\nimport time\nimport datetime\nimport random\nimport yaml\nfrom multiprocessing.pool import *\n\n\nclass DBArgs(object):\n\n def __init__(self, dbtype, config, dbname=None):\n self.dbtype = dbtype\n if self.dbtype == 'mysql':\n self.host = config['host']\n self.port = config['port']\n self.user = config['user']\n self.password = config['password']\n self.dbname = dbname if dbname else config['dbname']\n self.driver = 'com.mysql.jdbc.Driver'\n self.jdbc = 'jdbc:mysql://'\n else:\n self.host = config['host']\n self.port = config['port']\n self.user = config['user']\n self.password = config['password']\n self.dbname = dbname if dbname else config['dbname']\n self.driver = 'org.postgresql.Driver'\n self.jdbc = 'jdbc:postgresql://'\n\nclass Database():\n def __init__(self, args, timeout=-1):\n self.args = args\n self.conn = self.resetConn(timeout)\n\n\n # self.schema = self.compute_table_schema()\n\n def resetConn(self, timeout=-1):\n conn = psycopg2.connect(database=self.args.dbname,\n user=self.args.user,\n password=self.args.password,\n host=self.args.host,\n port=self.args.port)\n return conn\n \n def execute_sqls(self,sql):\n self.conn =self.resetConn(timeout=-1)\n cur = self.conn.cursor()\n cur.execute(sql)\n self.conn.commit()\n cur.close()\n self.conn.close()\n\n def execute_sql_duration(self, duration, sql, max_id=0, commit_interval=500):\n self.conn = self.resetConn(timeout=-1)\n cursor = self.conn.cursor()\n start = time.time()\n cnt = 0\n if duration > 0:\n while (time.time() - start) < duration:\n if max_id > 0:\n id = random.randint(1, max_id - 1)\n cursor.execute(sql + str(id) + ';')\n else:\n cursor.execute(sql)\n cnt += 1\n if cnt % commit_interval == 0:\n self.conn.commit()\n else:\n print(\"error, the duration should be larger than 0\")\n self.conn.commit()\n cursor.close()\n self.conn.close()\n return cnt\n\n def concurrent_execute_sql(self, threads, duration, sql, max_id=0, commit_interval=500):\n pool = ThreadPool(threads)\n results = [pool.apply_async(self.execute_sql_duration, (duration, sql, max_id, commit_interval)) for _ in range(threads)]\n pool.close()\n pool.join()\n return results\n\ndef init():\n #add the config\n config_path = \"/root/DB-GPT/config/tool_config.yaml\"\n with open(config_path, 'r') as config_file:\n config = yaml.safe_load(config_file) \n db_args =DBArgs('pgsql', config)\n return db_args\n\n\n#create a table\ndef create_table(table_name,colsize, ncolumns):\n db=Database(init())\n column_definitions = ', 
'.join(f'name{i} varchar({colsize})' for i in range(ncolumns))\n creat_sql = f'CREATE TABLE {table_name} (id int, {column_definitions}, time timestamp);'\n db.execute_sqls(creat_sql)\n\n#delete the table\ndef delete_table(table_name):\n db=Database(init())\n delete_sql=f'DROP TABLE if exists {table_name}'\n db.execute_sqls(delete_sql)\n\n#print the current time\ndef print_time():\n current_time = datetime.datetime.now()\n formatted_time = current_time.strftime(\"%Y-%m-%d %H:%M:%S\")\n print(formatted_time)\n\n\ndef lock_contention(threads,duration,ncolumns,nrows,colsize,table_name='table1'):\n #create a new table\n print_time()\n delete_table(table_name)\n create_table(table_name,colsize, ncolumns)\n db=Database(init())\n # insert some data to be updated \n insert_definitions = ', '.join(f'(SELECT substr(md5(random()::text), 1, {colsize}))' for i in range(ncolumns))\n insert_data=f'insert into {table_name} select generate_series(1,{nrows}),{insert_definitions}, now();' \n db.execute_sqls(insert_data) \n pool = Pool(threads)\n for _ in range(threads):\n pool.apply_async(\n lock, (table_name, ncolumns, colsize, duration, nrows))\n pool.close()\n pool.join()\n #delete the table\n delete_table(table_name)\n print_time()\n\ndef lock(table_name, ncolumns, colsize, duration, nrows):\n args=init()\n start = time.time()\n #lock_contention\n while time.time()-start < duration:\n conn = psycopg2.connect(database=args.dbname, user=args.user, password=args.password,\n host=args.host, port=args.port)\n cur = conn.cursor()\n while time.time()-start < duration:\n col_name = random.randint(0, ncolumns-1)\n row_name = random.randint(1, nrows-1)\n lock_contention = f'update {table_name} set name{col_name}=(SELECT substr(md5(random()::text), 1, {colsize})) where id ={row_name}'\n #db.concurrent_execute_sql(threads,duration,lock_contention,nrows)\n cur.execute(lock_contention)\n conn.commit()\n conn.commit()\n conn.close()\n\nif __name__ == \"__main__\":\n # Number of threads to use for concurrent inserts\n num_threads = 156\n \n # Duration for which to run the inserts (in seconds)\n insert_duration = None\n \n # Number of columns in the table\n num_columns = 78\n \n # Number of rows to insert\n num_rows = 218\n \n # Size of each column (in characters)\n column_size = 99\n \n # Table name\n table_name = 'table1'\n \n # Call the insert_large_data function\n lock_contention(num_threads, insert_duration, num_columns, num_rows, column_size, table_name)\n", + "description": "In a database used by an online platform, if 156 users simultaneously attempt to perform frequent update operations in a database table with 78 columns and 218 rows of records, each with a column size of 99 characters, and compete to lock the table during the update process, it could lead to a database exception due to lock contention.\n", + "desc": "In an Internet of Things (IoT) scenario, there is a database named 'IoTDataDB' that stores sensor data from various devices. One important table in this database is called 'SensorReadings', which contains information about the readings from different sensors. This table has 218 rows, each corresponding to a sensor reading, and a total of 78 columns, each capable of storing up to 99 characters. These columns include sensor ID, sensor type, reading value, timestamp, location, and other relevant data. In a particular situation, 156 devices are simultaneously sending sensor data to the 'SensorReadings' table. 
Due to the high volume of data being written, the database encounters performance issues related to locking. This means that when multiple devices try to write data simultaneously, there is contention for resources, causing delays or even failures in writing the data. This lock contention affects the database's ability to efficiently handle concurrent write operations, potentially leading to anomalies.\n" + }, + "307": { + "start_time": "1697341622", + "end_time": "1697341685", + "start_timestamp": "2023-10-15 11:47:02", + "end_timestamp": "2023-10-15 11:48:05", + "alerts": [], + "labels": [ + "highly deletes" + ], + "command": "python anomaly_trigger/main.py --anomaly VACUUM", + "script": "import psycopg2\nimport sys\nsys.path.append('/root/DB-GPT/')\nimport time\nimport datetime\nimport random\nimport yaml\nfrom multiprocessing.pool import *\n\n\nclass DBArgs(object):\n\n def __init__(self, dbtype, config, dbname=None):\n self.dbtype = dbtype\n if self.dbtype == 'mysql':\n self.host = config['host']\n self.port = config['port']\n self.user = config['user']\n self.password = config['password']\n self.dbname = dbname if dbname else config['dbname']\n self.driver = 'com.mysql.jdbc.Driver'\n self.jdbc = 'jdbc:mysql://'\n else:\n self.host = config['host']\n self.port = config['port']\n self.user = config['user']\n self.password = config['password']\n self.dbname = dbname if dbname else config['dbname']\n self.driver = 'org.postgresql.Driver'\n self.jdbc = 'jdbc:postgresql://'\n\nclass Database():\n def __init__(self, args, timeout=-1):\n self.args = args\n self.conn = self.resetConn(timeout)\n\n\n # self.schema = self.compute_table_schema()\n\n def resetConn(self, timeout=-1):\n conn = psycopg2.connect(database=self.args.dbname,\n user=self.args.user,\n password=self.args.password,\n host=self.args.host,\n port=self.args.port)\n return conn\n \n def execute_sqls(self,sql):\n self.conn =self.resetConn(timeout=-1)\n cur = self.conn.cursor()\n cur.execute(sql)\n self.conn.commit()\n cur.close()\n self.conn.close()\n\n def execute_sql_duration(self, duration, sql, max_id=0, commit_interval=500):\n self.conn = self.resetConn(timeout=-1)\n cursor = self.conn.cursor()\n start = time.time()\n cnt = 0\n if duration > 0:\n while (time.time() - start) < duration:\n if max_id > 0:\n id = random.randint(1, max_id - 1)\n cursor.execute(sql + str(id) + ';')\n else:\n cursor.execute(sql)\n cnt += 1\n if cnt % commit_interval == 0:\n self.conn.commit()\n else:\n print(\"error, the duration should be larger than 0\")\n self.conn.commit()\n cursor.close()\n self.conn.close()\n return cnt\n\n def concurrent_execute_sql(self, threads, duration, sql, max_id=0, commit_interval=500):\n pool = ThreadPool(threads)\n results = [pool.apply_async(self.execute_sql_duration, (duration, sql, max_id, commit_interval)) for _ in range(threads)]\n pool.close()\n pool.join()\n return results\n \ndef init():\n #add the config\n config_path = \"/root/DB-GPT/config/tool_config.yaml\"\n with open(config_path, 'r') as config_file:\n config = yaml.safe_load(config_file) \n db_args =DBArgs('pgsql', config)\n return db_args\n\n\n#create a table\ndef create_table(table_name,colsize, ncolumns):\n db=Database(init())\n column_definitions = ', '.join(f'name{i} varchar({colsize})' for i in range(ncolumns))\n creat_sql = f'CREATE TABLE {table_name} (id int, {column_definitions}, time timestamp);'\n db.execute_sqls(creat_sql)\n\n#delete the table\ndef delete_table(table_name):\n db=Database(init())\n delete_sql=f'DROP TABLE if exists 
{table_name}'\n db.execute_sqls(delete_sql)\n\n#print the current time\ndef print_time():\n current_time = datetime.datetime.now()\n formatted_time = current_time.strftime(\"%Y-%m-%d %H:%M:%S\")\n print(formatted_time)\n\n\ndef vacuum(threads,duration,ncolumns,nrows,colsize,table_name='table1'):\n db=Database(init())\n #create a new table\n print_time()\n delete_table(table_name)\n create_table(table_name,colsize, ncolumns)\n\n # insert some data to be deleted\n insert_definitions = ', '.join(f'(SELECT substr(md5(random()::text), 1, {colsize}))' for i in range(ncolumns))\n insert_data=f'insert into {table_name} select generate_series(1,{nrows}),{insert_definitions}, now();' \n db.execute_sqls(insert_data) \n\n # delete 80% of the rows\n delete_nrows=int(nrows*0.8)\n vacuum=f'delete from {table_name} where id < {delete_nrows};'\n db.execute_sqls(vacuum)\n\n # do the select , then the vacuum occurs\n select='select * from '+table_name+' where id='\n db.concurrent_execute_sql(threads,duration,select,nrows)\n\n #delete the table\n delete_table(table_name)\n print_time()\n\n\nif __name__ == \"__main__\":\n # Number of threads to use for concurrent inserts\n num_threads = 198\n \n # Duration for which to run the inserts (in seconds)\n insert_duration = None\n \n # Number of columns in the table\n num_columns = 10\n \n # Number of rows to insert\n num_rows = 2366643\n \n # Size of each column (in characters)\n column_size = 85\n \n # Table name\n table_name = 'table1'\n \n # Call the insert_large_data function\n vacuum(num_threads, insert_duration, num_columns, num_rows, column_size, table_name)\n", + "description": "In an online store database, there is a need to optimize the performance of search queries made by 198 users. The database table contains 10 columns and 2,366,643 rows, with each column having a size of 85 characters. By simulating a VACUUM operation, which involves the reorganization and optimization of the database storage, we can trigger a database exception to evaluate the impact on search performance.\n", + "desc": "In an online store's database, named 'OnlineStoreDB', there is a key table called 'ProductRecords' that stores detailed information about different products available for sale. This table contains a total of 2,366,643 rows, with each row representing a unique product entry. The table consists of 10 columns, where each column can hold information up to 85 characters long. These columns include attributes such as product ID, name, price, stock quantity, description, brand, category, size, color, and weight.In this particular scenario, the database administrator needs to perform a 'VACUUM' operation. The 'VACUUM' operation is used to reclaim unused space and improve the performance of the database. It involves restructuring the storage of database files, removing any fragmentation, and reclaiming space from deleted or updated data. However, due to the large scale of the table (2,366,643 rows), the 'VACUUM' operation might have a significant impact on the database's performance. If not properly managed, it could potentially slow down other database operations such as inserting new products, updating prices, or processing user queries. 
Therefore, in order to minimize the impact on the overall system, it is important to carefully plan and optimize the 'VACUUM' operation, considering factors like workload, traffic, and resource availability.\n" + }, + "308": { + "start_time": "1697341745", + "end_time": "1697341861", + "start_timestamp": "2023-10-15 11:49:05", + "end_timestamp": "2023-10-15 11:51:01", + "alerts": [], + "labels": [ + "too many indexes" + ], + "command": "python anomaly_trigger/main.py --anomaly REDUNDANT_INDEX", + "script": "import psycopg2\nimport sys\nsys.path.append('/root/DB-GPT/')\nimport time\nimport datetime\nimport random\nimport yaml\nfrom multiprocessing.pool import *\n\n\nclass DBArgs(object):\n\n def __init__(self, dbtype, config, dbname=None):\n self.dbtype = dbtype\n if self.dbtype == 'mysql':\n self.host = config['host']\n self.port = config['port']\n self.user = config['user']\n self.password = config['password']\n self.dbname = dbname if dbname else config['dbname']\n self.driver = 'com.mysql.jdbc.Driver'\n self.jdbc = 'jdbc:mysql://'\n else:\n self.host = config['host']\n self.port = config['port']\n self.user = config['user']\n self.password = config['password']\n self.dbname = dbname if dbname else config['dbname']\n self.driver = 'org.postgresql.Driver'\n self.jdbc = 'jdbc:postgresql://'\n\nclass Database():\n def __init__(self, args, timeout=-1):\n self.args = args\n self.conn = self.resetConn(timeout)\n\n\n # self.schema = self.compute_table_schema()\n\n def resetConn(self, timeout=-1):\n conn = psycopg2.connect(database=self.args.dbname,\n user=self.args.user,\n password=self.args.password,\n host=self.args.host,\n port=self.args.port)\n return conn\n \n def execute_sqls(self,sql):\n self.conn =self.resetConn(timeout=-1)\n cur = self.conn.cursor()\n cur.execute(sql)\n self.conn.commit()\n cur.close()\n self.conn.close()\n\n def execute_sql_duration(self, duration, sql, max_id=0, commit_interval=500):\n self.conn = self.resetConn(timeout=-1)\n cursor = self.conn.cursor()\n start = time.time()\n cnt = 0\n if duration > 0:\n while (time.time() - start) < duration:\n if max_id > 0:\n id = random.randint(1, max_id - 1)\n cursor.execute(sql + str(id) + ';')\n else:\n cursor.execute(sql)\n cnt += 1\n if cnt % commit_interval == 0:\n self.conn.commit()\n else:\n print(\"error, the duration should be larger than 0\")\n self.conn.commit()\n cursor.close()\n self.conn.close()\n return cnt\n\n def concurrent_execute_sql(self, threads, duration, sql, max_id=0, commit_interval=500):\n pool = ThreadPool(threads)\n results = [pool.apply_async(self.execute_sql_duration, (duration, sql, max_id, commit_interval)) for _ in range(threads)]\n pool.close()\n pool.join()\n return results\n \n def build_index(self, table_name, idx_num):\n self.conn = self.resetConn(timeout=-1)\n cursor = self.conn.cursor()\n \n for i in range(0, idx_num):\n the_sql = 'CREATE INDEX index_' + table_name + '_' + str(i) + ' ON ' + table_name + '(name' + str(i) + ');'\n print(the_sql)\n cursor.execute(the_sql)\n\n \n self.conn.commit()\n self.conn.close()\n return\n\n\n \n def drop_index(self,table_name):\n self.conn = self.resetConn(timeout=-1)\n cursor = self.conn.cursor()\n cursor.execute(\"select indexname from pg_indexes where tablename='\"+table_name+\"';\")\n idxs = cursor.fetchall()\n for idx in idxs:\n the_sql = 'DROP INDEX ' + idx[0] + ';'\n cursor.execute(the_sql)\n print(the_sql)\n self.conn.commit()\n self.conn.close()\n return\n\n\ndef init():\n #add the config\n config_path = 
\"/root/DB-GPT/config/tool_config.yaml\"\n with open(config_path, 'r') as config_file:\n config = yaml.safe_load(config_file) \n db_args =DBArgs('pgsql', config)\n return db_args\n\n\n#create a table\ndef create_table(table_name,colsize, ncolumns):\n db=Database(init())\n column_definitions = ', '.join(f'name{i} varchar({colsize})' for i in range(ncolumns))\n creat_sql = f'CREATE TABLE {table_name} (id int, {column_definitions}, time timestamp);'\n db.execute_sqls(creat_sql)\n\n#delete the table\ndef delete_table(table_name):\n db=Database(init())\n delete_sql=f'DROP TABLE if exists {table_name}'\n db.execute_sqls(delete_sql)\n\n#print the current time\ndef print_time():\n current_time = datetime.datetime.now()\n formatted_time = current_time.strftime(\"%Y-%m-%d %H:%M:%S\")\n print(formatted_time)\n\ndef redundent_index(threads,duration,ncolumns,nrows,colsize,nindex,table_name='table1'):\n #create a new table\n print_time()\n delete_table(table_name)\n create_table(table_name,colsize, ncolumns)\n db=Database(init())\n # insert some data to be updated \n insert_definitions = ', '.join(f'(SELECT substr(md5(random()::text), 1, {colsize}))' for i in range(ncolumns))\n insert_data=f'insert into {table_name} select generate_series(1,{nrows}),{insert_definitions}, now();' \n db.execute_sqls(insert_data) \n\n #initialization of the indexes\n nindex=int((nindex*ncolumns)/10)\n db.build_index(table_name,nindex)\n id_index='CREATE INDEX index_'+table_name+'_id ON '+table_name+'(id);'\n db.execute_sqls(id_index)\n\n #lock_contention\n pool = Pool(threads)\n for _ in range(threads):\n pool.apply_async(\n lock, (table_name, ncolumns, colsize, duration, nrows))\n pool.close()\n pool.join()\n\n #drop the index\n db.drop_index(table_name)\n\n #delete the table\n delete_table(table_name)\n print_time()\n\n\ndef lock(table_name, ncolumns, colsize, duration, nrows):\n args=init()\n start = time.time()\n #lock_contention\n while time.time()-start < duration:\n conn = psycopg2.connect(database=args.dbname, user=args.user, password=args.password,\n host=args.host, port=args.port)\n cur = conn.cursor()\n while time.time()-start < duration:\n col_name = random.randint(0, ncolumns-1)\n row_name = random.randint(1, nrows-1)\n lock_contention = f'update {table_name} set name{col_name}=(SELECT substr(md5(random()::text), 1, {colsize})) where id ={row_name}'\n #db.concurrent_execute_sql(threads,duration,lock_contention,nrows)\n cur.execute(lock_contention)\n conn.commit()\n conn.commit()\n conn.close()\n\nif __name__ == \"__main__\":\n # Number of threads to use for concurrent inserts\n num_threads = 10\n \n # Duration for which to run the inserts (in seconds)\n insert_duration = None\n \n # Number of columns in the table\n num_columns = 75\n \n # Number of rows to insert\n num_rows = 471392\n \n # Size of each column (in characters)\n column_size = 96\n \n # Table name\n table_name = 'table1'\n \n nindex=6\n \n # Call the insert_large_data function\n redundent_index(num_threads, insert_duration, num_columns, num_rows, column_size, nindex,table_name)\n", + "description": "In an e-commerce database, 10 users simultaneously perform a query operation on a database table with 75 columns and 471,392 rows of product records. Each column has a size of 96 characters. The query operation involves creating redundant indexes for items such as product name, category, and price range before the query and deleting them after the query. 
This simulates the additional storage footprint and performance overhead caused by creating and deleting redundant indexes.\n", + "desc": "In an e-commerce scenario, there is a database for an online store called 'OnlineStoreDB'. This database stores various information about products and is used to support online shopping. One of the tables in this database is 'ProductCatalog', which contains detailed information about different products. This table consists of 471,392 rows of data, each representing a different product, with a total of 75 columns. These columns include product ID, name, price, stock quantity, description, brand, category, size, color, weight, production date, expiration date, supplier information, country, rating, number of reviews, sales status, promotional information, image link, and other related attributes.In order to improve the performance of queries and reports related to product analysis, the database administrator decides to create multiple indexes on the 'ProductCatalog' table. These indexes could be based on different columns such as category, brand, or price. However, due to improper planning or a lack of understanding of the data access patterns, the administrator creates redundant indexes that do not provide any additional benefit.The presence of these redundant indexes can negatively impact the database's overall performance. It can result in increased storage consumption, as the redundant indexes occupy additional space. Moreover, during data modification operations such as inserting or updating product records, maintaining these redundant indexes can incur unnecessary overhead, slowing down the overall database performance. Additionally, if the update frequency of the 'ProductCatalog' table is high, the redundant indexes may become inconsistent with the actual data, leading to data integrity issues. Therefore, the presence of redundant indexes can have a detrimental effect on the performance and efficiency of the entire online store's database system.\n" + }, + "309": { + "start_time": "1697341921", + "end_time": "1697342012", + "start_timestamp": "2023-10-15 11:52:01", + "end_timestamp": "2023-10-15 11:53:32", + "alerts": [], + "labels": [ + "INSERT_LARGE_DATA", + "IO_CONTENTION" + ], + "command": "python anomaly_trigger/main.py --anomaly INSERT_LARGE_DATA,IO_CONTENTION", + "script": "import os\nimport datetime\n\n#print the current time\ndef print_time():\n current_time = datetime.datetime.now()\n formatted_time = current_time.strftime(\"%Y-%m-%d %H:%M:%S\")\n print(formatted_time)\n\nif __name__ == \"__main__\":\n print_time()\n command = (\n \"su - root -c 'cd /sysbench-tpcc-master; \"\n \"./tpcc.lua --db-driver=pgsql --tables=2 --scale=3 --threads=50 --events=0 \"\n \"--pgsql-host=xxxx --pgsql-user=xxxx --pgsql-password=xxxx \"\n \"--pgsql-port=5432 --pgsql-db=tpcc --time=90 --rand-type=uniform --report-interval=10 run'\"\n )\n\n os.system(command)\n print_time()\n", + "description": "In a file sharing platform where multiple users are sharing files, there is a situation where multiple users are uploading, downloading, or editing files at the same time. This results in competition for input/output (I/O) resources, which leads to a slowdown in the file transfer process. The given command is used to simulate and trigger this scenario.\n", + "desc": "In a file-sharing system, let's imagine there is a database called 'TeamFileShareDB', which is used by teams or organizations for sharing files. 
This database not only stores the files themselves but also records metadata such as uploader information, file size, creation and modification dates, version history, access permissions, and download counts. On a typical day, multiple users may simultaneously upload, download, or edit files. For example, a project team collaborating on an important report frequently uploads the latest versions of files, while others download them for viewing or editing. Additionally, the system might be used for storing and sharing large files, such as presentations, video conference recordings, or design drawings. Due to the high concurrency in file operations, the 'TeamFileShareDB' database faces challenges related to input/output (I/O) contention. When multiple users simultaneously upload or download large files, the system's storage and network bandwidth become significantly strained. This I/O contention can result in slower file transfer speeds, particularly when there is limited bandwidth or insufficient server processing capabilities. Moreover, frequent write operations in the database, such as file uploads and metadata updates, can impact database performance. During peak periods, the database may encounter issues with locking and transaction management, further slowing down file processing and metadata recording.\n" + }, + "310": { + "start_time": "1697342072", + "end_time": "1697342132", + "start_timestamp": "2023-10-15 11:54:32", + "end_timestamp": "2023-10-15 11:55:32", + "alerts": [], + "labels": [ + "POOR JOIN PERFORMANCE", + "CPU CONTENTION" + ], + "command": "python anomaly_trigger/main.py --anomaly POOR_JOIN_PERFORMANCE,CPU_CONTENTION", + "script": "import os\nimport re\nimport time\n\nimport psycopg2\n\n\nREPEATCOUNT = 1\nTIMELOGPATH = str(int(time.time())) + \"_job_trigger_time_log.txt\"\nTIMELOG = open(TIMELOGPATH, 'w+')\n\n\nclass Database():\n\n def __init__(self):\n self.conn = None\n self.conn = psycopg2.connect(database='imdbload',\n user='xxxx',\n password='xxxx',\n host='xxxx',\n port=5432)\n\n def execute_sql(self, sql):\n fail = 1\n cur = self.conn.cursor()\n i = 0\n cnt = 3\n while fail == 1 and i < cnt:\n try:\n fail = 0\n cur.execute(sql)\n except BaseException as error:\n fail = 1\n print(error)\n res = []\n if fail == 0:\n res = cur.fetchall()\n i = i + 1\n if fail == 1:\n # print(\"SQL Execution Fatal!!\", sql)\n return 0, ''\n elif fail == 0:\n return 1, res\n\n\ndef all_sql_files():\n res_path = \"{}/join-order-benchmark-master/\".format(\n os.path.dirname(os.path.abspath(__file__)))\n # all_file_list = list(filter(file_filter, os.listdir(res_path)))\n # all_file_list = sorted(all_file_list, key=custom_sort)\n all_file_list = [\n '1a.sql', '1b.sql', '1c.sql', '1d.sql',\n '2a.sql', '2b.sql', '2c.sql', '2d.sql',\n '3a.sql', '3b.sql', '3c.sql',\n '4a.sql', '4b.sql', '4c.sql',\n '5a.sql', '5b.sql', '5c.sql',\n '6a.sql', '6b.sql', '6c.sql', '6d.sql', '6e.sql', '6f.sql',\n '7a.sql', '7b.sql', '7c.sql',\n '8a.sql', '8b.sql', '8c.sql', '8d.sql',\n '9a.sql', '9b.sql', '9c.sql', '9d.sql',\n '10a.sql', '10b.sql', '10c.sql',\n '11a.sql', '11b.sql', '11c.sql', '11d.sql',\n '12a.sql', '12b.sql', '12c.sql',\n '13a.sql', '13b.sql', '13c.sql', '13d.sql',\n '14a.sql', '14b.sql', '14c.sql',\n '15a.sql', '15b.sql', '15c.sql', '15d.sql',\n '16a.sql', '16b.sql', '16c.sql', '16d.sql',\n '17a.sql', '17b.sql', '17c.sql', '17d.sql', '17e.sql', '17f.sql',\n '18a.sql', '18b.sql', '18c.sql',\n '19a.sql', '19b.sql', '19c.sql', '19d.sql',\n '20a.sql', '20b.sql', '20c.sql',\n '21a.sql', '21b.sql', 
'21c.sql',\n '22a.sql', '22b.sql', '22c.sql', '22d.sql',\n '23a.sql', '23b.sql', '23c.sql',\n '24a.sql', '24b.sql',\n '25a.sql', '25b.sql', '25c.sql',\n '26a.sql', '26b.sql', '26c.sql',\n '27a.sql', '27b.sql', '27c.sql',\n '28a.sql', '28b.sql', '28c.sql',\n '29a.sql', '29b.sql', '29c.sql',\n '30a.sql', '30b.sql', '30c.sql',\n '31a.sql', '31b.sql', '31c.sql',\n '32a.sql', '32b.sql',\n '33a.sql', '33b.sql', '33c.sql']\n\n print(all_file_list)\n files_list = []\n for file in all_file_list:\n files_list.append(res_path + file)\n return files_list\n\n\ndef custom_sort(item):\n # \u63d0\u53d6\u6570\u5b57\u548c\u5b57\u6bcd\u90e8\u5206\n match = re.match(r'(\\d+)(\\D+)', item)\n # \u5c06\u6570\u5b57\u90e8\u5206\u8f6c\u6362\u4e3a\u6574\u6570\u4ee5\u8fdb\u884c\u6bd4\u8f83\n num_part = int(match.group(1))\n # \u8fd4\u56de\u5143\u7ec4\u4ee5\u6309\u6570\u5b57\u548c\u5b57\u6bcd\u6392\u5e8f\n return (num_part, match.group(2))\n\n\ndef file_filter(f):\n if f[-4:] == '.sql' and 'schema' not in f and 'fkindexes' not in f:\n return True\n else:\n return False\n\n\ndef get_sql_from_file(file_name):\n file = open(file_name)\n lines = file.readlines().copy()\n sql = ''\n for line in lines:\n sql += line\n sql = sql.replace('\n', ' ').replace(' ', ' ').replace(' ', ' ')\n file.close()\n return sql\n\n\ndef test_hint_from_file(sql_file):\n db = Database()\n sql = get_sql_from_file(sql_file)\n success, result_cont = db.execute_sql(sql)\n print(success, result_cont)\n\n\ndef test_all():\n sql_files = all_sql_files()\n\n for sql_file in list(sql_files)[:-10]:\n if sql_file:\n test_hint_from_file(sql_file)\n\n\ndef test_one():\n res_path = \"{}/join-order-benchmark-master/\".format(\n os.path.dirname(os.path.abspath(__file__)))\n test_hint_from_file(res_path + '1a.sql')\n\n\nif __name__ == '__main__':\n for i in range(0, REPEATCOUNT):\n TIMELOG.write(str(int(time.time()))+\";\")\n test_all()\n TIMELOG.write(str(int(time.time()))+\"\n\")\n TIMELOG.flush()\n\n TIMELOG.close()\n\n)\n", + "description": "In a data analysis system, there is a poor performance of data joining due to high CPU contention. This can occur when multiple processes or tasks compete for CPU resources, causing delays in executing join operations.\n", + "desc": "In a Business Intelligence (BI) scenario, imagine a database named 'CorporateAnalyticsDB', used for storing and analyzing various business data of a large corporation. This database contains multiple complex tables such as 'SalesData', 'CustomerProfiles', 'ProductCatalog', and 'MarketTrends', each filled with a large number of rows and columns, recording detailed business information.In this scenario, the company's analysts frequently need to perform complex join queries across multiple tables to generate comprehensive business reports. For example, they might need to join the 'SalesData' table (containing sales records) with the 'CustomerProfiles' table (containing customer information) to analyze the purchasing behaviors of different customer groups. At the same time, they might also need to link these data with 'ProductCatalog' (the product catalog) and 'MarketTrends' (market trends) to gain deeper market insights. Given the large size of these tables and the involvement of multi-table joins, the performance of these queries can become very slow. If the join keys in these tables are not effectively indexed, or if the queries are not properly optimized, then executing these join operations can consume a significant amount of time and resources. 
During peak periods, when multiple complex join queries are executed simultaneously, this can lead to competition for CPU resources, further reducing query efficiency. The CPU contention might occur due to too many compute-intensive queries running on the database server, or the server's CPU resources being insufficient to handle these queries.\n" + }, + "311": { + "start_time": "1697342192", + "end_time": "1697342341", + "start_timestamp": "2023-10-15 11:56:32", + "end_timestamp": "2023-10-15 11:59:01", + "alerts": [], + "labels": [ + "FETCH_LARGE_DATA", + "CORRELATED SUBQUERY" + ], + "command": "python anomaly_trigger/main.py --anomaly FETCH_LARGE_DATA,CORRELATED_SUBQUERY", + "script": "import os\nimport re\nimport time\n\nimport psycopg2\n\n\nREPEATCOUNT = 1\nTIMELOGPATH = str(int(time.time())) + \"_tpch_trigger_time_log.txt\"\nTIMELOG = open(TIMELOGPATH, 'w+')\n\n\nclass Database():\n\n def __init__(self):\n self.conn = None\n self.conn = psycopg2.connect(database='tpch',\n user='xxxx',\n password='xxxx',\n host='xxxx',\n port=5432)\n\n def execute_sql(self, sql):\n fail = 1\n cur = self.conn.cursor()\n i = 0\n cnt = 3\n while fail == 1 and i < cnt:\n try:\n fail = 0\n cur.execute(sql)\n except BaseException as error:\n fail = 1\n print(error)\n res = []\n if fail == 0:\n res = cur.fetchall()\n i = i + 1\n if fail == 1:\n # print(\"SQL Execution Fatal!!\", sql)\n return 0, ''\n elif fail == 0:\n return 1, res\n\n\ndef all_sql_files():\n res_path = \"{}/tpch-queries/\".format(\n os.path.dirname(os.path.abspath(__file__)))\n # all_file_list = list(filter(file_filter, os.listdir(res_path)))\n # all_file_list = sorted(all_file_list, key=custom_sort)\n all_file_list = [\n '4.explain.sql']\n\n print(all_file_list)\n files_list = []\n for file in all_file_list:\n files_list.append(res_path + file)\n return files_list\n\n\ndef custom_sort(item):\n # \u63d0\u53d6\u6570\u5b57\u548c\u5b57\u6bcd\u90e8\u5206\n match = re.match(r'(\\d+)(\\D+)', item)\n # \u5c06\u6570\u5b57\u90e8\u5206\u8f6c\u6362\u4e3a\u6574\u6570\u4ee5\u8fdb\u884c\u6bd4\u8f83\n num_part = int(match.group(1))\n # \u8fd4\u56de\u5143\u7ec4\u4ee5\u6309\u6570\u5b57\u548c\u5b57\u6bcd\u6392\u5e8f\n return (num_part, match.group(2))\n\n\ndef file_filter(f):\n if f[-4:] == '.sql' and 'schema' not in f and 'fkindexes' not in f:\n return True\n else:\n return False\n\n\ndef get_sql_from_file(file_name):\n file = open(file_name)\n lines = file.readlines().copy()\n sql = ''\n for line in lines:\n sql += line\n sql = sql.replace('\n', ' ').replace(' ', ' ').replace(' ', ' ')\n file.close()\n return sql\n\n\ndef test_hint_from_file(sql_file):\n db = Database()\n sql = get_sql_from_file(sql_file)\n success, result_cont = db.execute_sql(sql)\n print(success, result_cont)\n\n\ndef test_all():\n sql_files = all_sql_files()\n\n for sql_file in list(sql_files):\n if sql_file:\n test_hint_from_file(sql_file)\n\n\ndef test_one():\n res_path = \"{}/tpch-queries/\".format(\n os.path.dirname(os.path.abspath(__file__)))\n test_hint_from_file(res_path + '1.explain.sql')\n\n\nif __name__ == '__main__':\n for i in range(0, REPEATCOUNT):\n TIMELOG.write(str(int(time.time()))+\";\")\n test_all()\n TIMELOG.write(str(int(time.time()))+\"\n\")\n TIMELOG.flush()\n\n TIMELOG.close()\n", + "description": "In an e-commerce platform's database, when trying to fetch a large amount of product data, related subqueries are used to gather additional information about each product. 
However, if these subqueries are not optimized, the performance of the overall data retrieval process may be negatively impacted.\n", + "desc": "In an e-commerce scenario, there is a database called 'ECommerceDB' that stores various information about products. Within this database, there is a table named 'ProductInventory' that records the inventory information of thousands or even hundreds of thousands of products. This table includes data such as product ID, current stock level, last inventory update time, supplier ID, and warehouse location. When conducting queries to determine the inventory level of specific products or categories, related subqueries are performed. These subqueries can be time-consuming and inefficient, especially when dealing with a large number of products. As a result, the database might experience I/O bottlenecks when reading a significant amount of data from the disk to retrieve inventory information for a large number of products.\n" + }, + "312": { + "start_time": "1697342402", + "end_time": "1697342474", + "start_timestamp": "2023-10-15 12:00:02", + "end_timestamp": "2023-10-15 12:01:14", + "alerts": [], + "labels": [ + "highly concurrent commits or highly concurrent inserts" + ], + "command": "python anomaly_trigger/main.py --anomaly INSERT_LARGE_DATA", + "script": "import psycopg2\nimport sys\nsys.path.append('/root/DB-GPT/')\nimport time\nimport datetime\nimport random\nimport yaml\nfrom multiprocessing.pool import *\n\nclass DBArgs(object):\n\n def __init__(self, dbtype, config, dbname=None):\n self.dbtype = dbtype\n if self.dbtype == 'mysql':\n self.host = config['host']\n self.port = config['port']\n self.user = config['user']\n self.password = config['password']\n self.dbname = dbname if dbname else config['dbname']\n self.driver = 'com.mysql.jdbc.Driver'\n self.jdbc = 'jdbc:mysql://'\n else:\n self.host = config['host']\n self.port = config['port']\n self.user = config['user']\n self.password = config['password']\n self.dbname = dbname if dbname else config['dbname']\n self.driver = 'org.postgresql.Driver'\n self.jdbc = 'jdbc:postgresql://'\n\nclass Database():\n def __init__(self, args, timeout=-1):\n self.args = args\n self.conn = self.resetConn(timeout)\n\n\n # self.schema = self.compute_table_schema()\n\n def resetConn(self, timeout=-1):\n conn = psycopg2.connect(database=self.args.dbname,\n user=self.args.user,\n password=self.args.password,\n host=self.args.host,\n port=self.args.port)\n return conn\n\n def execute_sqls(self,sql):\n self.conn =self.resetConn(timeout=-1)\n cur = self.conn.cursor()\n cur.execute(sql)\n self.conn.commit()\n cur.close()\n self.conn.close()\n\n def execute_sql_duration(self, duration, sql, max_id=0, commit_interval=500):\n self.conn = self.resetConn(timeout=-1)\n cursor = self.conn.cursor()\n start = time.time()\n cnt = 0\n if duration > 0:\n while (time.time() - start) < duration:\n if max_id > 0:\n id = random.randint(1, max_id - 1)\n cursor.execute(sql + str(id) + ';')\n else:\n cursor.execute(sql)\n cnt += 1\n if cnt % commit_interval == 0:\n self.conn.commit()\n else:\n print(\"error, the duration should be larger than 0\")\n self.conn.commit()\n cursor.close()\n self.conn.close()\n return cnt\n\n def concurrent_execute_sql(self, threads, duration, sql, max_id=0, commit_interval=500):\n pool = ThreadPool(threads)\n results = [pool.apply_async(self.execute_sql_duration, (duration, sql, max_id, commit_interval)) for _ in range(threads)]\n pool.close()\n pool.join()\n return results\n \ndef init():\n #add the 
config\n config_path = \"/root/DB-GPT/config/tool_config.yaml\"\n with open(config_path, 'r') as config_file:\n config = yaml.safe_load(config_file) \n db_args =DBArgs('pgsql', config)\n return db_args\n\n#create a table\ndef create_table(table_name,colsize, ncolumns):\n db=Database(init())\n column_definitions = ', '.join(f'name{i} varchar({colsize})' for i in range(ncolumns))\n creat_sql = f'CREATE TABLE {table_name} (id int, {column_definitions}, time timestamp);'\n db.execute_sqls(creat_sql)\n\n#delete the table\ndef delete_table(table_name):\n db=Database(init())\n delete_sql=f'DROP TABLE if exists {table_name}'\n db.execute_sqls(delete_sql)\n\n#print the current time\ndef print_time():\n current_time = datetime.datetime.now()\n formatted_time = current_time.strftime(\"%Y-%m-%d %H:%M:%S\")\n print(formatted_time)\n\n\ndef insert_large_data(threads,duration,ncolumns,nrows,colsize,table_name='table1'):\n\n print_time()\n #Delete undeleted tables\n delete_table(table_name)\n #create a new table\n create_table(table_name,colsize, ncolumns)\n db=Database(init())\n #insert the data\n #insert_definitions = ', '.join(f'repeat(round(random()*999)::text,{(colsize//3)})' for i in range(ncolumns))\n insert_definitions = ', '.join(f'(SELECT substr(md5(random()::text), 1, {colsize}))' for i in range(ncolumns))\n insert_data=f'insert into {table_name} select generate_series(1,{nrows}),{insert_definitions}, now();'\n db.concurrent_execute_sql(threads,duration,insert_data,commit_interval=1)\n\n #delete the table\n delete_table(table_name)\n \n #print the end time\n print_time()\nif __name__ == \"__main__\":\n # Number of threads to use for concurrent inserts\n num_threads = 174\n \n # Duration for which to run the inserts (in seconds)\n insert_duration = None\n \n # Number of columns in the table\n num_columns = 5\n \n # Number of rows to insert\n num_rows = 78\n \n # Size of each column (in characters)\n column_size = 73\n \n # Table name\n table_name = 'table1'\n \n # Call the insert_large_data function\n insert_large_data(num_threads, insert_duration, num_columns, num_rows, column_size, table_name)\n", + "description": "In a data-intensive application, such as a financial system, a large number of transactions need to be processed simultaneously. This results in the insertion of a large amount of data into the database. Specifically, 174 threads are used to insert data into a table with 5 columns, each with a size of 73 characters, containing 78 rows. The purpose of this script is to simulate the database exception that may occur during this process.\n", + "desc": "In a file sharing system scenario, there is a database called 'FileShareDB', which is used by teams or organizations to share files. This database stores the files themselves and also records metadata about the files, such as uploader information, file size, creation and modification dates, version history, access permissions, and download counts. In this case, many users are simultaneously uploading and downloading files, with a particularly high number of upload operations. These file uploads can involve large files, such as presentations, video conference recordings, or design drawings. With 174 upload threads running simultaneously, each thread processing 78 rows of data in the 'FileShareDB' database, there might be performance issues due to the database being unable to handle the high volume of concurrent write requests. This can result in slower upload speeds, delays, or failures in file transfers. 
Additionally, the database might encounter storage and I/O contention problems, as the system's storage and network bandwidth may be strained. This can further impact upload performance and the overall efficiency of the file sharing system.\n" + }, + "313": { + "start_time": "1697342534", + "end_time": "1697342606", + "start_timestamp": "2023-10-15 12:02:14", + "end_timestamp": "2023-10-15 12:03:26", + "alerts": [], + "labels": [ + "highly concurrent commits or highly concurrent inserts" + ], + "command": "python anomaly_trigger/main.py --anomaly INSERT_LARGE_DATA", + "script": "import psycopg2\nimport sys\nsys.path.append('/root/DB-GPT/')\nimport time\nimport datetime\nimport random\nimport yaml\nfrom multiprocessing.pool import *\n\nclass DBArgs(object):\n\n def __init__(self, dbtype, config, dbname=None):\n self.dbtype = dbtype\n if self.dbtype == 'mysql':\n self.host = config['host']\n self.port = config['port']\n self.user = config['user']\n self.password = config['password']\n self.dbname = dbname if dbname else config['dbname']\n self.driver = 'com.mysql.jdbc.Driver'\n self.jdbc = 'jdbc:mysql://'\n else:\n self.host = config['host']\n self.port = config['port']\n self.user = config['user']\n self.password = config['password']\n self.dbname = dbname if dbname else config['dbname']\n self.driver = 'org.postgresql.Driver'\n self.jdbc = 'jdbc:postgresql://'\n\nclass Database():\n def __init__(self, args, timeout=-1):\n self.args = args\n self.conn = self.resetConn(timeout)\n\n\n # self.schema = self.compute_table_schema()\n\n def resetConn(self, timeout=-1):\n conn = psycopg2.connect(database=self.args.dbname,\n user=self.args.user,\n password=self.args.password,\n host=self.args.host,\n port=self.args.port)\n return conn\n\n def execute_sqls(self,sql):\n self.conn =self.resetConn(timeout=-1)\n cur = self.conn.cursor()\n cur.execute(sql)\n self.conn.commit()\n cur.close()\n self.conn.close()\n\n def execute_sql_duration(self, duration, sql, max_id=0, commit_interval=500):\n self.conn = self.resetConn(timeout=-1)\n cursor = self.conn.cursor()\n start = time.time()\n cnt = 0\n if duration > 0:\n while (time.time() - start) < duration:\n if max_id > 0:\n id = random.randint(1, max_id - 1)\n cursor.execute(sql + str(id) + ';')\n else:\n cursor.execute(sql)\n cnt += 1\n if cnt % commit_interval == 0:\n self.conn.commit()\n else:\n print(\"error, the duration should be larger than 0\")\n self.conn.commit()\n cursor.close()\n self.conn.close()\n return cnt\n\n def concurrent_execute_sql(self, threads, duration, sql, max_id=0, commit_interval=500):\n pool = ThreadPool(threads)\n results = [pool.apply_async(self.execute_sql_duration, (duration, sql, max_id, commit_interval)) for _ in range(threads)]\n pool.close()\n pool.join()\n return results\n \ndef init():\n #add the config\n config_path = \"/root/DB-GPT/config/tool_config.yaml\"\n with open(config_path, 'r') as config_file:\n config = yaml.safe_load(config_file) \n db_args =DBArgs('pgsql', config)\n return db_args\n\n#create a table\ndef create_table(table_name,colsize, ncolumns):\n db=Database(init())\n column_definitions = ', '.join(f'name{i} varchar({colsize})' for i in range(ncolumns))\n creat_sql = f'CREATE TABLE {table_name} (id int, {column_definitions}, time timestamp);'\n db.execute_sqls(creat_sql)\n\n#delete the table\ndef delete_table(table_name):\n db=Database(init())\n delete_sql=f'DROP TABLE if exists {table_name}'\n db.execute_sqls(delete_sql)\n\n#print the current time\ndef print_time():\n current_time = 
datetime.datetime.now()\n formatted_time = current_time.strftime(\"%Y-%m-%d %H:%M:%S\")\n print(formatted_time)\n\n\ndef insert_large_data(threads,duration,ncolumns,nrows,colsize,table_name='table1'):\n\n print_time()\n #Delete undeleted tables\n delete_table(table_name)\n #create a new table\n create_table(table_name,colsize, ncolumns)\n db=Database(init())\n #insert the data\n #insert_definitions = ', '.join(f'repeat(round(random()*999)::text,{(colsize//3)})' for i in range(ncolumns))\n insert_definitions = ', '.join(f'(SELECT substr(md5(random()::text), 1, {colsize}))' for i in range(ncolumns))\n insert_data=f'insert into {table_name} select generate_series(1,{nrows}),{insert_definitions}, now();'\n db.concurrent_execute_sql(threads,duration,insert_data,commit_interval=1)\n\n #delete the table\n delete_table(table_name)\n \n #print the end time\n print_time()\nif __name__ == \"__main__\":\n # Number of threads to use for concurrent inserts\n num_threads = 174\n \n # Duration for which to run the inserts (in seconds)\n insert_duration = None\n \n # Number of columns in the table\n num_columns = 38\n \n # Number of rows to insert\n num_rows = 88\n \n # Size of each column (in characters)\n column_size = 50\n \n # Table name\n table_name = 'table1'\n \n # Call the insert_large_data function\n insert_large_data(num_threads, insert_duration, num_columns, num_rows, column_size, table_name)\n", + "description": "In an industrial manufacturing process, data from 174 sensors needs to be inserted into a database. Each sensor generates data with 38 attributes, each with a size of 50 characters. This process is simulated by running the script, causing database exceptions due to the large amount of data being inserted.\n", + "desc": "In the IoT scenario, there is a database dedicated to collecting and managing sensor data, named 'SensorDataDB'. This database is designed to handle data from various sensors and IoT devices. In this case, we have a table named 'SensorReadings' that stores readings from 88 sensors. Each row represents a reading and contains 38 columns, each capable of holding up to 50 characters. These columns may include sensor ID, reading type, reading value, timestamp, sensor location, and status information.Now, let's consider a situation where 174 sensors are simultaneously transmitting data to the database at a high frequency. Due to the large volume of incoming data, the database might encounter performance issues. This could be due to inadequate data partitioning, lack of buffering mechanisms, or insufficient indexing. 
These issues can hinder the database's ability to handle a large number of concurrent write requests, resulting in increased write latency and even database locking.These anomalies can negatively impact the real-time data processing and analysis capabilities of the system, affecting the overall efficiency and effectiveness of the IoT applications relying on this database.\n" + }, + "314": { + "start_time": "1697342666", + "end_time": "1697342726", + "start_timestamp": "2023-10-15 12:04:26", + "end_timestamp": "2023-10-15 12:05:26", + "alerts": [], + "labels": [ + "highly concurrent updates" + ], + "command": "python anomaly_trigger/main.py --anomaly LOCK_CONTENTION", + "script": "import psycopg2\nimport sys\nsys.path.append('/root/DB-GPT/')\nimport time\nimport datetime\nimport random\nimport yaml\nfrom multiprocessing.pool import *\n\n\nclass DBArgs(object):\n\n def __init__(self, dbtype, config, dbname=None):\n self.dbtype = dbtype\n if self.dbtype == 'mysql':\n self.host = config['host']\n self.port = config['port']\n self.user = config['user']\n self.password = config['password']\n self.dbname = dbname if dbname else config['dbname']\n self.driver = 'com.mysql.jdbc.Driver'\n self.jdbc = 'jdbc:mysql://'\n else:\n self.host = config['host']\n self.port = config['port']\n self.user = config['user']\n self.password = config['password']\n self.dbname = dbname if dbname else config['dbname']\n self.driver = 'org.postgresql.Driver'\n self.jdbc = 'jdbc:postgresql://'\n\nclass Database():\n def __init__(self, args, timeout=-1):\n self.args = args\n self.conn = self.resetConn(timeout)\n\n\n # self.schema = self.compute_table_schema()\n\n def resetConn(self, timeout=-1):\n conn = psycopg2.connect(database=self.args.dbname,\n user=self.args.user,\n password=self.args.password,\n host=self.args.host,\n port=self.args.port)\n return conn\n \n def execute_sqls(self,sql):\n self.conn =self.resetConn(timeout=-1)\n cur = self.conn.cursor()\n cur.execute(sql)\n self.conn.commit()\n cur.close()\n self.conn.close()\n\n def execute_sql_duration(self, duration, sql, max_id=0, commit_interval=500):\n self.conn = self.resetConn(timeout=-1)\n cursor = self.conn.cursor()\n start = time.time()\n cnt = 0\n if duration > 0:\n while (time.time() - start) < duration:\n if max_id > 0:\n id = random.randint(1, max_id - 1)\n cursor.execute(sql + str(id) + ';')\n else:\n cursor.execute(sql)\n cnt += 1\n if cnt % commit_interval == 0:\n self.conn.commit()\n else:\n print(\"error, the duration should be larger than 0\")\n self.conn.commit()\n cursor.close()\n self.conn.close()\n return cnt\n\n def concurrent_execute_sql(self, threads, duration, sql, max_id=0, commit_interval=500):\n pool = ThreadPool(threads)\n results = [pool.apply_async(self.execute_sql_duration, (duration, sql, max_id, commit_interval)) for _ in range(threads)]\n pool.close()\n pool.join()\n return results\n\ndef init():\n #add the config\n config_path = \"/root/DB-GPT/config/tool_config.yaml\"\n with open(config_path, 'r') as config_file:\n config = yaml.safe_load(config_file) \n db_args =DBArgs('pgsql', config)\n return db_args\n\n\n#create a table\ndef create_table(table_name,colsize, ncolumns):\n db=Database(init())\n column_definitions = ', '.join(f'name{i} varchar({colsize})' for i in range(ncolumns))\n creat_sql = f'CREATE TABLE {table_name} (id int, {column_definitions}, time timestamp);'\n db.execute_sqls(creat_sql)\n\n#delete the table\ndef delete_table(table_name):\n db=Database(init())\n delete_sql=f'DROP TABLE if exists {table_name}'\n 
db.execute_sqls(delete_sql)\n\n#print the current time\ndef print_time():\n current_time = datetime.datetime.now()\n formatted_time = current_time.strftime(\"%Y-%m-%d %H:%M:%S\")\n print(formatted_time)\n\n\ndef lock_contention(threads,duration,ncolumns,nrows,colsize,table_name='table1'):\n #create a new table\n print_time()\n delete_table(table_name)\n create_table(table_name,colsize, ncolumns)\n db=Database(init())\n # insert some data to be updated \n insert_definitions = ', '.join(f'(SELECT substr(md5(random()::text), 1, {colsize}))' for i in range(ncolumns))\n insert_data=f'insert into {table_name} select generate_series(1,{nrows}),{insert_definitions}, now();' \n db.execute_sqls(insert_data) \n pool = Pool(threads)\n for _ in range(threads):\n pool.apply_async(\n lock, (table_name, ncolumns, colsize, duration, nrows))\n pool.close()\n pool.join()\n #delete the table\n delete_table(table_name)\n print_time()\n\ndef lock(table_name, ncolumns, colsize, duration, nrows):\n args=init()\n start = time.time()\n #lock_contention\n while time.time()-start < duration:\n conn = psycopg2.connect(database=args.dbname, user=args.user, password=args.password,\n host=args.host, port=args.port)\n cur = conn.cursor()\n while time.time()-start < duration:\n col_name = random.randint(0, ncolumns-1)\n row_name = random.randint(1, nrows-1)\n lock_contention = f'update {table_name} set name{col_name}=(SELECT substr(md5(random()::text), 1, {colsize})) where id ={row_name}'\n #db.concurrent_execute_sql(threads,duration,lock_contention,nrows)\n cur.execute(lock_contention)\n conn.commit()\n conn.commit()\n conn.close()\n\nif __name__ == \"__main__\":\n # Number of threads to use for concurrent inserts\n num_threads = 122\n \n # Duration for which to run the inserts (in seconds)\n insert_duration = None\n \n # Number of columns in the table\n num_columns = 56\n \n # Number of rows to insert\n num_rows = 221\n \n # Size of each column (in characters)\n column_size = 99\n \n # Table name\n table_name = 'table1'\n \n # Call the insert_large_data function\n lock_contention(num_threads, insert_duration, num_columns, num_rows, column_size, table_name)\n", + "description": "In an online platform's database, 122 users simultaneously attempt to perform frequent update operations in a database table containing 56 columns and 221 rows of records for a duration of time, with each column having a size of 99 characters. These users compete with each other to lock the database table, resulting in a contention for resources and potential exceptions in the database.\n", + "desc": "In a database named 'BankingDB', designed to handle customer and transaction data for a bank, there is a main table called 'AccountTransactions' that records detailed information about various banking transactions. This table contains 221 rows of data, each representing a transaction record for an account. It consists of 56 columns, each containing information of up to 99 characters. These columns may include transaction ID, account number, transaction type (such as deposit, withdrawal, transfer), transaction amount, transaction date and time, counterparty account information, transaction status, employee ID (of the bank staff who processed the transaction), transaction location, currency type, and more. During a busy period, 122 users simultaneously attempt frequent update operations on the 'AccountTransactions' table. These operations could include modifying transaction amounts, updating transaction statuses, or adding transaction notes. 
Due to the table's design and the database's locking mechanism, multiple users trying to update the same or adjacent rows at the same time can lead to a competition for locking the database table. If this locking contention lasts for a considerable period, it could cause performance issues in the database. It might result in delayed processing or failure of other users' transaction requests, affecting the daily operations of the bank. Additionally, if such incidents occur frequently, the database transaction log could grow rapidly, consuming excessive storage space and potentially causing temporary interruptions in database services.\n" + }, + "315": { + "start_time": "1697342786", + "end_time": "1697342875", + "start_timestamp": "2023-10-15 12:06:26", + "end_timestamp": "2023-10-15 12:07:55", + "alerts": [], + "labels": [ + "highly deletes" + ], + "command": "python anomaly_trigger/main.py --anomaly VACUUM", + "script": "import psycopg2\nimport sys\nsys.path.append('/root/DB-GPT/')\nimport time\nimport datetime\nimport random\nimport yaml\nfrom multiprocessing.pool import *\n\n\nclass DBArgs(object):\n\n def __init__(self, dbtype, config, dbname=None):\n self.dbtype = dbtype\n if self.dbtype == 'mysql':\n self.host = config['host']\n self.port = config['port']\n self.user = config['user']\n self.password = config['password']\n self.dbname = dbname if dbname else config['dbname']\n self.driver = 'com.mysql.jdbc.Driver'\n self.jdbc = 'jdbc:mysql://'\n else:\n self.host = config['host']\n self.port = config['port']\n self.user = config['user']\n self.password = config['password']\n self.dbname = dbname if dbname else config['dbname']\n self.driver = 'org.postgresql.Driver'\n self.jdbc = 'jdbc:postgresql://'\n\nclass Database():\n def __init__(self, args, timeout=-1):\n self.args = args\n self.conn = self.resetConn(timeout)\n\n\n # self.schema = self.compute_table_schema()\n\n def resetConn(self, timeout=-1):\n conn = psycopg2.connect(database=self.args.dbname,\n user=self.args.user,\n password=self.args.password,\n host=self.args.host,\n port=self.args.port)\n return conn\n \n def execute_sqls(self,sql):\n self.conn =self.resetConn(timeout=-1)\n cur = self.conn.cursor()\n cur.execute(sql)\n self.conn.commit()\n cur.close()\n self.conn.close()\n\n def execute_sql_duration(self, duration, sql, max_id=0, commit_interval=500):\n self.conn = self.resetConn(timeout=-1)\n cursor = self.conn.cursor()\n start = time.time()\n cnt = 0\n if duration > 0:\n while (time.time() - start) < duration:\n if max_id > 0:\n id = random.randint(1, max_id - 1)\n cursor.execute(sql + str(id) + ';')\n else:\n cursor.execute(sql)\n cnt += 1\n if cnt % commit_interval == 0:\n self.conn.commit()\n else:\n print(\"error, the duration should be larger than 0\")\n self.conn.commit()\n cursor.close()\n self.conn.close()\n return cnt\n\n def concurrent_execute_sql(self, threads, duration, sql, max_id=0, commit_interval=500):\n pool = ThreadPool(threads)\n results = [pool.apply_async(self.execute_sql_duration, (duration, sql, max_id, commit_interval)) for _ in range(threads)]\n pool.close()\n pool.join()\n return results\n \ndef init():\n #add the config\n config_path = \"/root/DB-GPT/config/tool_config.yaml\"\n with open(config_path, 'r') as config_file:\n config = yaml.safe_load(config_file) \n db_args =DBArgs('pgsql', config)\n return db_args\n\n\n#create a table\ndef create_table(table_name,colsize, ncolumns):\n db=Database(init())\n column_definitions = ', '.join(f'name{i} varchar({colsize})' for i in range(ncolumns))\n 
creat_sql = f'CREATE TABLE {table_name} (id int, {column_definitions}, time timestamp);'\n db.execute_sqls(creat_sql)\n\n#delete the table\ndef delete_table(table_name):\n db=Database(init())\n delete_sql=f'DROP TABLE if exists {table_name}'\n db.execute_sqls(delete_sql)\n\n#print the current time\ndef print_time():\n current_time = datetime.datetime.now()\n formatted_time = current_time.strftime(\"%Y-%m-%d %H:%M:%S\")\n print(formatted_time)\n\n\ndef vacuum(threads,duration,ncolumns,nrows,colsize,table_name='table1'):\n db=Database(init())\n #create a new table\n print_time()\n delete_table(table_name)\n create_table(table_name,colsize, ncolumns)\n\n # insert some data to be deleted\n insert_definitions = ', '.join(f'(SELECT substr(md5(random()::text), 1, {colsize}))' for i in range(ncolumns))\n insert_data=f'insert into {table_name} select generate_series(1,{nrows}),{insert_definitions}, now();' \n db.execute_sqls(insert_data) \n\n # delete 80% of the rows\n delete_nrows=int(nrows*0.8)\n vacuum=f'delete from {table_name} where id < {delete_nrows};'\n db.execute_sqls(vacuum)\n\n # do the select , then the vacuum occurs\n select='select * from '+table_name+' where id='\n db.concurrent_execute_sql(threads,duration,select,nrows)\n\n #delete the table\n delete_table(table_name)\n print_time()\n\n\nif __name__ == \"__main__\":\n # Number of threads to use for concurrent inserts\n num_threads = 61\n \n # Duration for which to run the inserts (in seconds)\n insert_duration = None\n \n # Number of columns in the table\n num_columns = 5\n \n # Number of rows to insert\n num_rows = 2784402\n \n # Size of each column (in characters)\n column_size = 50\n \n # Table name\n table_name = 'table1'\n \n # Call the insert_large_data function\n vacuum(num_threads, insert_duration, num_columns, num_rows, column_size, table_name)\n", + "description": "In the database of a social media platform, if 61 users simultaneously perform a search after a data cleaning operation on a database table containing 5 columns, 2,784,402 rows, each column size of 50 characters, an exception might occur due to the high demand on the database resources.\n", + "desc": "In a file sharing system like 'TeamFileShareDB', which is used by teams or organizations to share files, there is a database named 'TeamFileShareDB' that stores the files and their metadata. This database contains a key table named 'FileMetadata' which stores information about each file, such as the file name, size, upload date, uploader information, and download count. During the usage of this system, multiple users might simultaneously upload, download, or update files, resulting in a large volume of file-related operations. In such scenarios, the 'TeamFileShareDB' might encounter performance issues due to excessive fragmentation and inefficient storage utilization caused by repeated file deletions. These issues can arise from the lack of automatic data compression or database vacuuming mechanisms. If the database is not properly maintained, it may consume significant storage space and lead to poor overall performance. Additionally, due to the lack of database vacuuming, the database index might become fragmented, leading to slower query execution and reduced overall system efficiency. 
So, executing the provided script specifically triggers the 'VACUUM' anomaly, which simulates the need for a database vacuum operation to address these issues by reclaiming unused space and reorganizing data and indexes.\n" + }, + "316": { + "start_time": "1697342935", + "end_time": "1697343049", + "start_timestamp": "2023-10-15 12:08:55", + "end_timestamp": "2023-10-15 12:10:49", + "alerts": [], + "labels": [ + "too many indexes" + ], + "command": "python anomaly_trigger/main.py --anomaly REDUNDANT_INDEX", + "script": "import psycopg2\nimport sys\nsys.path.append('/root/DB-GPT/')\nimport time\nimport datetime\nimport random\nimport yaml\nfrom multiprocessing.pool import *\n\n\nclass DBArgs(object):\n\n def __init__(self, dbtype, config, dbname=None):\n self.dbtype = dbtype\n if self.dbtype == 'mysql':\n self.host = config['host']\n self.port = config['port']\n self.user = config['user']\n self.password = config['password']\n self.dbname = dbname if dbname else config['dbname']\n self.driver = 'com.mysql.jdbc.Driver'\n self.jdbc = 'jdbc:mysql://'\n else:\n self.host = config['host']\n self.port = config['port']\n self.user = config['user']\n self.password = config['password']\n self.dbname = dbname if dbname else config['dbname']\n self.driver = 'org.postgresql.Driver'\n self.jdbc = 'jdbc:postgresql://'\n\nclass Database():\n def __init__(self, args, timeout=-1):\n self.args = args\n self.conn = self.resetConn(timeout)\n\n\n # self.schema = self.compute_table_schema()\n\n def resetConn(self, timeout=-1):\n conn = psycopg2.connect(database=self.args.dbname,\n user=self.args.user,\n password=self.args.password,\n host=self.args.host,\n port=self.args.port)\n return conn\n \n def execute_sqls(self,sql):\n self.conn =self.resetConn(timeout=-1)\n cur = self.conn.cursor()\n cur.execute(sql)\n self.conn.commit()\n cur.close()\n self.conn.close()\n\n def execute_sql_duration(self, duration, sql, max_id=0, commit_interval=500):\n self.conn = self.resetConn(timeout=-1)\n cursor = self.conn.cursor()\n start = time.time()\n cnt = 0\n if duration > 0:\n while (time.time() - start) < duration:\n if max_id > 0:\n id = random.randint(1, max_id - 1)\n cursor.execute(sql + str(id) + ';')\n else:\n cursor.execute(sql)\n cnt += 1\n if cnt % commit_interval == 0:\n self.conn.commit()\n else:\n print(\"error, the duration should be larger than 0\")\n self.conn.commit()\n cursor.close()\n self.conn.close()\n return cnt\n\n def concurrent_execute_sql(self, threads, duration, sql, max_id=0, commit_interval=500):\n pool = ThreadPool(threads)\n results = [pool.apply_async(self.execute_sql_duration, (duration, sql, max_id, commit_interval)) for _ in range(threads)]\n pool.close()\n pool.join()\n return results\n \n def build_index(self, table_name, idx_num):\n self.conn = self.resetConn(timeout=-1)\n cursor = self.conn.cursor()\n \n for i in range(0, idx_num):\n the_sql = 'CREATE INDEX index_' + table_name + '_' + str(i) + ' ON ' + table_name + '(name' + str(i) + ');'\n print(the_sql)\n cursor.execute(the_sql)\n\n \n self.conn.commit()\n self.conn.close()\n return\n\n\n \n def drop_index(self,table_name):\n self.conn = self.resetConn(timeout=-1)\n cursor = self.conn.cursor()\n cursor.execute(\"select indexname from pg_indexes where tablename='\"+table_name+\"';\")\n idxs = cursor.fetchall()\n for idx in idxs:\n the_sql = 'DROP INDEX ' + idx[0] + ';'\n cursor.execute(the_sql)\n print(the_sql)\n self.conn.commit()\n self.conn.close()\n return\n\n\ndef init():\n #add the config\n config_path = 
\"/root/DB-GPT/config/tool_config.yaml\"\n with open(config_path, 'r') as config_file:\n config = yaml.safe_load(config_file) \n db_args =DBArgs('pgsql', config)\n return db_args\n\n\n#create a table\ndef create_table(table_name,colsize, ncolumns):\n db=Database(init())\n column_definitions = ', '.join(f'name{i} varchar({colsize})' for i in range(ncolumns))\n creat_sql = f'CREATE TABLE {table_name} (id int, {column_definitions}, time timestamp);'\n db.execute_sqls(creat_sql)\n\n#delete the table\ndef delete_table(table_name):\n db=Database(init())\n delete_sql=f'DROP TABLE if exists {table_name}'\n db.execute_sqls(delete_sql)\n\n#print the current time\ndef print_time():\n current_time = datetime.datetime.now()\n formatted_time = current_time.strftime(\"%Y-%m-%d %H:%M:%S\")\n print(formatted_time)\n\ndef redundent_index(threads,duration,ncolumns,nrows,colsize,nindex,table_name='table1'):\n #create a new table\n print_time()\n delete_table(table_name)\n create_table(table_name,colsize, ncolumns)\n db=Database(init())\n # insert some data to be updated \n insert_definitions = ', '.join(f'(SELECT substr(md5(random()::text), 1, {colsize}))' for i in range(ncolumns))\n insert_data=f'insert into {table_name} select generate_series(1,{nrows}),{insert_definitions}, now();' \n db.execute_sqls(insert_data) \n\n #initialization of the indexes\n nindex=int((nindex*ncolumns)/10)\n db.build_index(table_name,nindex)\n id_index='CREATE INDEX index_'+table_name+'_id ON '+table_name+'(id);'\n db.execute_sqls(id_index)\n\n #lock_contention\n pool = Pool(threads)\n for _ in range(threads):\n pool.apply_async(\n lock, (table_name, ncolumns, colsize, duration, nrows))\n pool.close()\n pool.join()\n\n #drop the index\n db.drop_index(table_name)\n\n #delete the table\n delete_table(table_name)\n print_time()\n\n\ndef lock(table_name, ncolumns, colsize, duration, nrows):\n args=init()\n start = time.time()\n #lock_contention\n while time.time()-start < duration:\n conn = psycopg2.connect(database=args.dbname, user=args.user, password=args.password,\n host=args.host, port=args.port)\n cur = conn.cursor()\n while time.time()-start < duration:\n col_name = random.randint(0, ncolumns-1)\n row_name = random.randint(1, nrows-1)\n lock_contention = f'update {table_name} set name{col_name}=(SELECT substr(md5(random()::text), 1, {colsize})) where id ={row_name}'\n #db.concurrent_execute_sql(threads,duration,lock_contention,nrows)\n cur.execute(lock_contention)\n conn.commit()\n conn.commit()\n conn.close()\n\nif __name__ == \"__main__\":\n # Number of threads to use for concurrent inserts\n num_threads = 9\n \n # Duration for which to run the inserts (in seconds)\n insert_duration = None\n \n # Number of columns in the table\n num_columns = 75\n \n # Number of rows to insert\n num_rows = 868525\n \n # Size of each column (in characters)\n column_size = 88\n \n # Table name\n table_name = 'table1'\n \n nindex=6\n \n # Call the insert_large_data function\n redundent_index(num_threads, insert_duration, num_columns, num_rows, column_size, nindex,table_name)\n", + "description": "In a financial database with 75 columns and 868,525 rows, each with a column size of 88 characters, a large number of indexes are created for different financial metrics at the beginning of a query. Nine users then perform various search operations on the database, and the indexes are deleted after the queries are finished. 
This simulates the additional storage and performance overhead caused by redundant indexes.\n", + "desc": "In an e-commerce platform, there is a database named 'ECommerceDB' specifically used for storing and managing product records. Within this database, there is a table called 'ProductDetails' which contains a large amount of data related to various products. This table consists of 868,525 rows, with each row representing a unique product entry. The table has 75 columns, each containing information of up to 88 characters. These columns may include product ID, name, price, stock quantity, description, brand, category, size, color, weight, production date, expiration date, supplier information, country, rating, number of reviews, sales status, promotional information, image link, and other related attributes. In this scenario, an anomaly can occur due to the presence of redundant indexes in the database. Redundant indexes are created to accelerate queries, particularly for financial analysis purposes. However, these redundant indexes can lead to additional storage usage and performance overhead in the database. Additionally, frequent index operations can cause database fragmentation, further impacting performance. Therefore, when nine users simultaneously query the 'ProductDetails' table, the presence of redundant indexes could slow down the query process, resulting in delayed report generation and affecting the efficiency of decision-making processes in the business intelligence context of the e-commerce platform.\n" + }, + "317": { + "start_time": "1697343109", + "end_time": "1697343200", + "start_timestamp": "2023-10-15 12:11:49", + "end_timestamp": "2023-10-15 12:13:20", + "alerts": [], + "labels": [ + "INSERT_LARGE_DATA", + "IO_CONTENTION" + ], + "command": "python anomaly_trigger/main.py --anomaly INSERT_LARGE_DATA,IO_CONTENTION", + "script": "import os\nimport datetime\n\n#print the current time\ndef print_time():\n current_time = datetime.datetime.now()\n formatted_time = current_time.strftime(\"%Y-%m-%d %H:%M:%S\")\n print(formatted_time)\n\nif __name__ == \"__main__\":\n print_time()\n command = (\n \"su - root -c 'cd /sysbench-tpcc-master; \"\n \"./tpcc.lua --db-driver=pgsql --tables=2 --scale=3 --threads=50 --events=0 \"\n \"--pgsql-host=xxxx --pgsql-user=xxxx --pgsql-password=xxxx \"\n \"--pgsql-port=5432 --pgsql-db=tpcc --time=90 --rand-type=uniform --report-interval=10 run'\"\n )\n\n os.system(command)\n print_time()\n", + "description": "In a shared file storage system, multiple users are simultaneously uploading, downloading, or editing files. This leads to contention for input/output (I/O) resources, resulting in slower file transfer speeds.\n", + "desc": "In the file sharing system scenario, there is a database called 'FileShareDB' that is used by a team or organization for sharing files. The database stores both the files themselves and the metadata associated with the files, such as uploader information, file size, creation and modification dates, version history, access permissions, and download counts. During a typical workday, multiple users may be uploading, downloading, or editing files simultaneously. For example, a project team might be collaborating on an important report, with team members frequently uploading the latest versions of files while others download them for viewing or editing. Additionally, the system allows for storage and sharing of large files, such as presentations, video conference recordings, or design drawings. 
Due to the high concurrency of file operations, the database 'FileShareDB' faces challenges in terms of input/output (I/O) contention. When multiple users concurrently upload or download large files, the system's storage and network bandwidth can become strained. This I/O contention leads to slower file transfer speeds, especially in situations with limited bandwidth or insufficient server processing capabilities. Additionally, frequent write operations in the database, such as file uploads and metadata updates, can impact database performance. During peak periods, the database may experience locking and transaction management issues, further slowing down file processing and metadata recording.\n" + }, + "318": { + "start_time": "1697343260", + "end_time": "1697343320", + "start_timestamp": "2023-10-15 12:14:20", + "end_timestamp": "2023-10-15 12:15:20", + "alerts": [], + "labels": [ + "POOR JOIN PERFORMANCE", + "CPU CONTENTION" + ], + "command": "python anomaly_trigger/main.py --anomaly POOR_JOIN_PERFORMANCE,CPU_CONTENTION", + "script": "import os\nimport re\nimport time\n\nimport psycopg2\n\n\nREPEATCOUNT = 1\nTIMELOGPATH = str(int(time.time())) + \"_job_trigger_time_log.txt\"\nTIMELOG = open(TIMELOGPATH, 'w+')\n\n\nclass Database():\n\n def __init__(self):\n self.conn = None\n self.conn = psycopg2.connect(database='imdbload',\n user='xxxx',\n password='xxxx',\n host='xxxx',\n port=5432)\n\n def execute_sql(self, sql):\n fail = 1\n cur = self.conn.cursor()\n i = 0\n cnt = 3\n while fail == 1 and i < cnt:\n try:\n fail = 0\n cur.execute(sql)\n except BaseException as error:\n fail = 1\n print(error)\n res = []\n if fail == 0:\n res = cur.fetchall()\n i = i + 1\n if fail == 1:\n # print(\"SQL Execution Fatal!!\", sql)\n return 0, ''\n elif fail == 0:\n return 1, res\n\n\ndef all_sql_files():\n res_path = \"{}/join-order-benchmark-master/\".format(\n os.path.dirname(os.path.abspath(__file__)))\n # all_file_list = list(filter(file_filter, os.listdir(res_path)))\n # all_file_list = sorted(all_file_list, key=custom_sort)\n all_file_list = [\n '1a.sql', '1b.sql', '1c.sql', '1d.sql',\n '2a.sql', '2b.sql', '2c.sql', '2d.sql',\n '3a.sql', '3b.sql', '3c.sql',\n '4a.sql', '4b.sql', '4c.sql',\n '5a.sql', '5b.sql', '5c.sql',\n '6a.sql', '6b.sql', '6c.sql', '6d.sql', '6e.sql', '6f.sql',\n '7a.sql', '7b.sql', '7c.sql',\n '8a.sql', '8b.sql', '8c.sql', '8d.sql',\n '9a.sql', '9b.sql', '9c.sql', '9d.sql',\n '10a.sql', '10b.sql', '10c.sql',\n '11a.sql', '11b.sql', '11c.sql', '11d.sql',\n '12a.sql', '12b.sql', '12c.sql',\n '13a.sql', '13b.sql', '13c.sql', '13d.sql',\n '14a.sql', '14b.sql', '14c.sql',\n '15a.sql', '15b.sql', '15c.sql', '15d.sql',\n '16a.sql', '16b.sql', '16c.sql', '16d.sql',\n '17a.sql', '17b.sql', '17c.sql', '17d.sql', '17e.sql', '17f.sql',\n '18a.sql', '18b.sql', '18c.sql',\n '19a.sql', '19b.sql', '19c.sql', '19d.sql',\n '20a.sql', '20b.sql', '20c.sql',\n '21a.sql', '21b.sql', '21c.sql',\n '22a.sql', '22b.sql', '22c.sql', '22d.sql',\n '23a.sql', '23b.sql', '23c.sql',\n '24a.sql', '24b.sql',\n '25a.sql', '25b.sql', '25c.sql',\n '26a.sql', '26b.sql', '26c.sql',\n '27a.sql', '27b.sql', '27c.sql',\n '28a.sql', '28b.sql', '28c.sql',\n '29a.sql', '29b.sql', '29c.sql',\n '30a.sql', '30b.sql', '30c.sql',\n '31a.sql', '31b.sql', '31c.sql',\n '32a.sql', '32b.sql',\n '33a.sql', '33b.sql', '33c.sql']\n\n print(all_file_list)\n files_list = []\n for file in all_file_list:\n files_list.append(res_path + file)\n return files_list\n\n\ndef custom_sort(item):\n # 
\u63d0\u53d6\u6570\u5b57\u548c\u5b57\u6bcd\u90e8\u5206\n match = re.match(r'(\\d+)(\\D+)', item)\n # \u5c06\u6570\u5b57\u90e8\u5206\u8f6c\u6362\u4e3a\u6574\u6570\u4ee5\u8fdb\u884c\u6bd4\u8f83\n num_part = int(match.group(1))\n # \u8fd4\u56de\u5143\u7ec4\u4ee5\u6309\u6570\u5b57\u548c\u5b57\u6bcd\u6392\u5e8f\n return (num_part, match.group(2))\n\n\ndef file_filter(f):\n if f[-4:] == '.sql' and 'schema' not in f and 'fkindexes' not in f:\n return True\n else:\n return False\n\n\ndef get_sql_from_file(file_name):\n file = open(file_name)\n lines = file.readlines().copy()\n sql = ''\n for line in lines:\n sql += line\n sql = sql.replace('\n', ' ').replace(' ', ' ').replace(' ', ' ')\n file.close()\n return sql\n\n\ndef test_hint_from_file(sql_file):\n db = Database()\n sql = get_sql_from_file(sql_file)\n success, result_cont = db.execute_sql(sql)\n print(success, result_cont)\n\n\ndef test_all():\n sql_files = all_sql_files()\n\n for sql_file in list(sql_files)[:-10]:\n if sql_file:\n test_hint_from_file(sql_file)\n\n\ndef test_one():\n res_path = \"{}/join-order-benchmark-master/\".format(\n os.path.dirname(os.path.abspath(__file__)))\n test_hint_from_file(res_path + '1a.sql')\n\n\nif __name__ == '__main__':\n for i in range(0, REPEATCOUNT):\n TIMELOG.write(str(int(time.time()))+\";\")\n test_all()\n TIMELOG.write(str(int(time.time()))+\"\n\")\n TIMELOG.flush()\n\n TIMELOG.close()\n\n)\n", + "description": "In a data analysis scenario, multiple threads attempt to perform a join operation on a large dataset, causing poor performance due to high CPU contention.\n", + "desc": "In the business intelligence scenario, imagine a database named 'CorporateAnalyticsDB' used for storing and analyzing various business data of a large corporation. This database contains multiple complex tables, such as 'SalesData', 'CustomerProfiles', 'ProductCatalog', and 'MarketTrends', each filled with a large number of rows and columns, recording detailed business information.In this scenario, the company's analysts frequently need to perform complex join queries across multiple tables to generate comprehensive business reports. For example, they might need to join the 'SalesData' table (containing sales records) with the 'CustomerProfiles' table (containing customer information) to analyze the purchasing behaviors of different customer groups. At the same time, they might also need to link these data with 'ProductCatalog' (the product catalog) and 'MarketTrends' (market trends) to gain deeper market insights.Given the large size of these tables and the involvement of multi-table joins, the performance of these queries can become very slow. If the join keys in these tables are not effectively indexed, or if the queries are not properly optimized, then executing these join operations can consume a significant amount of time and resources. During peak periods, when multiple complex join queries are executed simultaneously, this can lead to competition for CPU resources, further reducing query efficiency. 
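One way to see where such a join spends its time is to inspect the plan the optimizer picks. The sketch below is a guess at what that check could look like: the table and column names ('SalesData', 'CustomerProfiles', 'customer_id', 'amount') are assumptions taken from the scenario text rather than a real schema, and the connection values are placeholders. EXPLAIN (ANALYZE, BUFFERS) shows whether the join keys are resolved through indexes or through full scans plus a hash or nested-loop join, which is usually where the CPU time goes.

import psycopg2

conn = psycopg2.connect(database='xxxx', user='xxxx', password='xxxx',
                        host='xxxx', port=5432)
cur = conn.cursor()

# Note: EXPLAIN ANALYZE actually executes the query while timing each plan node.
cur.execute("""
    EXPLAIN (ANALYZE, BUFFERS)
    SELECT c.customer_id, SUM(s.amount)
    FROM SalesData s
    JOIN CustomerProfiles c ON c.customer_id = s.customer_id
    GROUP BY c.customer_id;
""")
for (plan_line,) in cur.fetchall():
    print(plan_line)

cur.close()
conn.close()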
The CPU contention might occur due to too many compute-intensive queries running on the database server, or the server's CPU resources being insufficient to handle these queries.\n" + }, + "319": { + "start_time": "1697343380", + "end_time": "1697343520", + "start_timestamp": "2023-10-15 12:16:20", + "end_timestamp": "2023-10-15 12:18:40", + "alerts": [], + "labels": [ + "FETCH_LARGE_DATA", + "CORRELATED SUBQUERY" + ], + "command": "python anomaly_trigger/main.py --anomaly FETCH_LARGE_DATA,CORRELATED_SUBQUERY", + "script": "import os\nimport re\nimport time\n\nimport psycopg2\n\n\nREPEATCOUNT = 1\nTIMELOGPATH = str(int(time.time())) + \"_tpch_trigger_time_log.txt\"\nTIMELOG = open(TIMELOGPATH, 'w+')\n\n\nclass Database():\n\n def __init__(self):\n self.conn = None\n self.conn = psycopg2.connect(database='tpch',\n user='xxxx',\n password='xxxx',\n host='xxxx',\n port=5432)\n\n def execute_sql(self, sql):\n fail = 1\n cur = self.conn.cursor()\n i = 0\n cnt = 3\n while fail == 1 and i < cnt:\n try:\n fail = 0\n cur.execute(sql)\n except BaseException as error:\n fail = 1\n print(error)\n res = []\n if fail == 0:\n res = cur.fetchall()\n i = i + 1\n if fail == 1:\n # print(\"SQL Execution Fatal!!\", sql)\n return 0, ''\n elif fail == 0:\n return 1, res\n\n\ndef all_sql_files():\n res_path = \"{}/tpch-queries/\".format(\n os.path.dirname(os.path.abspath(__file__)))\n # all_file_list = list(filter(file_filter, os.listdir(res_path)))\n # all_file_list = sorted(all_file_list, key=custom_sort)\n all_file_list = [\n '4.explain.sql']\n\n print(all_file_list)\n files_list = []\n for file in all_file_list:\n files_list.append(res_path + file)\n return files_list\n\n\ndef custom_sort(item):\n # \u63d0\u53d6\u6570\u5b57\u548c\u5b57\u6bcd\u90e8\u5206\n match = re.match(r'(\\d+)(\\D+)', item)\n # \u5c06\u6570\u5b57\u90e8\u5206\u8f6c\u6362\u4e3a\u6574\u6570\u4ee5\u8fdb\u884c\u6bd4\u8f83\n num_part = int(match.group(1))\n # \u8fd4\u56de\u5143\u7ec4\u4ee5\u6309\u6570\u5b57\u548c\u5b57\u6bcd\u6392\u5e8f\n return (num_part, match.group(2))\n\n\ndef file_filter(f):\n if f[-4:] == '.sql' and 'schema' not in f and 'fkindexes' not in f:\n return True\n else:\n return False\n\n\ndef get_sql_from_file(file_name):\n file = open(file_name)\n lines = file.readlines().copy()\n sql = ''\n for line in lines:\n sql += line\n sql = sql.replace('\n', ' ').replace(' ', ' ').replace(' ', ' ')\n file.close()\n return sql\n\n\ndef test_hint_from_file(sql_file):\n db = Database()\n sql = get_sql_from_file(sql_file)\n success, result_cont = db.execute_sql(sql)\n print(success, result_cont)\n\n\ndef test_all():\n sql_files = all_sql_files()\n\n for sql_file in list(sql_files):\n if sql_file:\n test_hint_from_file(sql_file)\n\n\ndef test_one():\n res_path = \"{}/tpch-queries/\".format(\n os.path.dirname(os.path.abspath(__file__)))\n test_hint_from_file(res_path + '1.explain.sql')\n\n\nif __name__ == '__main__':\n for i in range(0, REPEATCOUNT):\n TIMELOG.write(str(int(time.time()))+\";\")\n test_all()\n TIMELOG.write(str(int(time.time()))+\"\n\")\n TIMELOG.flush()\n\n TIMELOG.close()\n", + "description": "In an e-commerce platform's database, there is a scenario where the system needs to fetch a large amount of data and execute correlated subqueries to retrieve inventory information for each product. However, if these subqueries are not optimized, the performance of the system can degrade, causing delays in retrieving inventory data.\n", + "desc": "In an IoT scenario, let's consider a sensing system database named 'SensorDataDB'. 
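When a result set of this size has to be pulled back to the client, a named (server-side) cursor keeps psycopg2 from materialising every row in memory at once. The sketch below is illustrative only; the 'SensorReadings' table and its column names are assumptions taken from the scenario, and the connection values are placeholders.

import psycopg2

conn = psycopg2.connect(database='xxxx', user='xxxx', password='xxxx',
                        host='xxxx', port=5432)

# Giving the cursor a name makes psycopg2 declare a server-side cursor,
# so rows are streamed in batches instead of being loaded all at once.
cur = conn.cursor(name='sensor_stream')
cur.itersize = 10000  # rows fetched per round trip to the server
cur.execute("SELECT sensor_id, reading_type, reading_value FROM SensorReadings;")

for sensor_id, reading_type, reading_value in cur:
    pass  # process one row at a time without holding the full result set

cur.close()
conn.close()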
This database is used to store and analyze data collected from various sensors. One of the key tables in this database is named 'SensorReadings', which records detailed information about the sensor readings. This table contains multiple fields including sensor ID, reading type, reading value, timestamp, location, and status. When conducting complex queries, such as comparing readings from different sensors or analyzing trends over time, it is often necessary to use correlated subqueries. These subqueries help to retrieve data related to specific sensor readings or perform comparisons between different sets of data. However, if the database contains a large amount of sensor data, and the subqueries are not efficiently executed, it can lead to poor performance. The execution of correlated subqueries might require accessing a large amount of data from disk, leading to potential I/O bottlenecks and slowdowns in query processing. Therefore, in this scenario, fetching large amounts of sensor data and utilizing correlated subqueries can impact the performance of the database.\n" + }, + "320": { + "start_time": "1697343581", + "end_time": "1697343653", + "start_timestamp": "2023-10-15 12:19:41", + "end_timestamp": "2023-10-15 12:20:53", + "alerts": [], + "labels": [ + "highly concurrent commits or highly concurrent inserts" + ], + "command": "python anomaly_trigger/main.py --anomaly INSERT_LARGE_DATA", + "script": "import psycopg2\nimport sys\nsys.path.append('/root/DB-GPT/')\nimport time\nimport datetime\nimport random\nimport yaml\nfrom multiprocessing.pool import *\n\nclass DBArgs(object):\n\n def __init__(self, dbtype, config, dbname=None):\n self.dbtype = dbtype\n if self.dbtype == 'mysql':\n self.host = config['host']\n self.port = config['port']\n self.user = config['user']\n self.password = config['password']\n self.dbname = dbname if dbname else config['dbname']\n self.driver = 'com.mysql.jdbc.Driver'\n self.jdbc = 'jdbc:mysql://'\n else:\n self.host = config['host']\n self.port = config['port']\n self.user = config['user']\n self.password = config['password']\n self.dbname = dbname if dbname else config['dbname']\n self.driver = 'org.postgresql.Driver'\n self.jdbc = 'jdbc:postgresql://'\n\nclass Database():\n def __init__(self, args, timeout=-1):\n self.args = args\n self.conn = self.resetConn(timeout)\n\n\n # self.schema = self.compute_table_schema()\n\n def resetConn(self, timeout=-1):\n conn = psycopg2.connect(database=self.args.dbname,\n user=self.args.user,\n password=self.args.password,\n host=self.args.host,\n port=self.args.port)\n return conn\n\n def execute_sqls(self,sql):\n self.conn =self.resetConn(timeout=-1)\n cur = self.conn.cursor()\n cur.execute(sql)\n self.conn.commit()\n cur.close()\n self.conn.close()\n\n def execute_sql_duration(self, duration, sql, max_id=0, commit_interval=500):\n self.conn = self.resetConn(timeout=-1)\n cursor = self.conn.cursor()\n start = time.time()\n cnt = 0\n if duration > 0:\n while (time.time() - start) < duration:\n if max_id > 0:\n id = random.randint(1, max_id - 1)\n cursor.execute(sql + str(id) + ';')\n else:\n cursor.execute(sql)\n cnt += 1\n if cnt % commit_interval == 0:\n self.conn.commit()\n else:\n print(\"error, the duration should be larger than 0\")\n self.conn.commit()\n cursor.close()\n self.conn.close()\n return cnt\n\n def concurrent_execute_sql(self, threads, duration, sql, max_id=0, commit_interval=500):\n pool = ThreadPool(threads)\n results = [pool.apply_async(self.execute_sql_duration, (duration, sql, max_id, 
commit_interval)) for _ in range(threads)]\n pool.close()\n pool.join()\n return results\n \ndef init():\n #add the config\n config_path = \"/root/DB-GPT/config/tool_config.yaml\"\n with open(config_path, 'r') as config_file:\n config = yaml.safe_load(config_file) \n db_args =DBArgs('pgsql', config)\n return db_args\n\n#create a table\ndef create_table(table_name,colsize, ncolumns):\n db=Database(init())\n column_definitions = ', '.join(f'name{i} varchar({colsize})' for i in range(ncolumns))\n creat_sql = f'CREATE TABLE {table_name} (id int, {column_definitions}, time timestamp);'\n db.execute_sqls(creat_sql)\n\n#delete the table\ndef delete_table(table_name):\n db=Database(init())\n delete_sql=f'DROP TABLE if exists {table_name}'\n db.execute_sqls(delete_sql)\n\n#print the current time\ndef print_time():\n current_time = datetime.datetime.now()\n formatted_time = current_time.strftime(\"%Y-%m-%d %H:%M:%S\")\n print(formatted_time)\n\n\ndef insert_large_data(threads,duration,ncolumns,nrows,colsize,table_name='table1'):\n\n print_time()\n #Delete undeleted tables\n delete_table(table_name)\n #create a new table\n create_table(table_name,colsize, ncolumns)\n db=Database(init())\n #insert the data\n #insert_definitions = ', '.join(f'repeat(round(random()*999)::text,{(colsize//3)})' for i in range(ncolumns))\n insert_definitions = ', '.join(f'(SELECT substr(md5(random()::text), 1, {colsize}))' for i in range(ncolumns))\n insert_data=f'insert into {table_name} select generate_series(1,{nrows}),{insert_definitions}, now();'\n db.concurrent_execute_sql(threads,duration,insert_data,commit_interval=1)\n\n #delete the table\n delete_table(table_name)\n \n #print the end time\n print_time()\nif __name__ == \"__main__\":\n # Number of threads to use for concurrent inserts\n num_threads = 182\n \n # Duration for which to run the inserts (in seconds)\n insert_duration = None\n \n # Number of columns in the table\n num_columns = 11\n \n # Number of rows to insert\n num_rows = 50\n \n # Size of each column (in characters)\n column_size = 26\n \n # Table name\n table_name = 'table1'\n \n # Call the insert_large_data function\n insert_large_data(num_threads, insert_duration, num_columns, num_rows, column_size, table_name)\n", + "description": "In a data analytics platform, 182 users are simultaneously inserting a large amount of data into a database table. The table has 11 columns, each with a size of 26 characters, and contains 50 rows of data. This process is designed to simulate the database exception that can occur when multiple users are trying to insert a large volume of data at the same time.\n", + "desc": "In an e-commerce platform, there is a database named 'ECommerceDB' that stores information about various products. One of the key tables in this database is 'ProductData', which contains data about different products. This table has 50 rows of data, with each row representing a product entry. The table has 11 columns, each containing information up to 26 characters long. These columns may include product ID, name, price, stock quantity, brand, category, size, color, weight, user rating, and number of reviews.In this scenario, there is a requirement to insert a large amount of data into the 'ProductData' table. A total of 182 threads are used to perform this operation. Each thread is responsible for inserting data into the table. However, due to the high number of concurrent insert operations and the amount of data being inserted, the database might face performance issues. 
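Part of the pressure in this entry comes from the trigger script committing after every statement (commit_interval=1), which forces a synchronous WAL flush per insert. A hedged variant is sketched below, grouping many of the same generated inserts into one transaction; the connection values are placeholders and the parameters mirror this entry's 'table1'.

import psycopg2

conn = psycopg2.connect(database='xxxx', user='xxxx', password='xxxx',
                        host='xxxx', port=5432)
cur = conn.cursor()

ncolumns, colsize, nrows = 11, 26, 50  # parameters of this entry's table1
cols = ', '.join(f'(SELECT substr(md5(random()::text), 1, {colsize}))'
                 for _ in range(ncolumns))
insert_sql = f'insert into table1 select generate_series(1,{nrows}), {cols}, now();'

BATCH = 500  # commit once per 500 statements instead of once per statement
for i in range(1, 5001):
    cur.execute(insert_sql)
    if i % BATCH == 0:  # fewer commits -> fewer synchronous WAL flushes
        conn.commit()
conn.commit()

cur.close()
conn.close()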
Without appropriate optimization techniques such as batch processing, efficient transaction management, or using bulk insert methods, the insertion process could become slower and inefficient. This might result in delays in inserting the data and could potentially lead to anomalies in the database.These anomalies could impact other operations in the e-commerce platform, such as product searches, inventory management, or user queries. It is important to carefully monitor and optimize the insertion process to ensure smooth database operations and maintain the efficiency of the overall platform.\n" + }, + "321": { + "start_time": "1697343713", + "end_time": "1697343786", + "start_timestamp": "2023-10-15 12:21:53", + "end_timestamp": "2023-10-15 12:23:06", + "alerts": [], + "labels": [ + "highly concurrent commits or highly concurrent inserts" + ], + "command": "python anomaly_trigger/main.py --anomaly INSERT_LARGE_DATA", + "script": "import psycopg2\nimport sys\nsys.path.append('/root/DB-GPT/')\nimport time\nimport datetime\nimport random\nimport yaml\nfrom multiprocessing.pool import *\n\nclass DBArgs(object):\n\n def __init__(self, dbtype, config, dbname=None):\n self.dbtype = dbtype\n if self.dbtype == 'mysql':\n self.host = config['host']\n self.port = config['port']\n self.user = config['user']\n self.password = config['password']\n self.dbname = dbname if dbname else config['dbname']\n self.driver = 'com.mysql.jdbc.Driver'\n self.jdbc = 'jdbc:mysql://'\n else:\n self.host = config['host']\n self.port = config['port']\n self.user = config['user']\n self.password = config['password']\n self.dbname = dbname if dbname else config['dbname']\n self.driver = 'org.postgresql.Driver'\n self.jdbc = 'jdbc:postgresql://'\n\nclass Database():\n def __init__(self, args, timeout=-1):\n self.args = args\n self.conn = self.resetConn(timeout)\n\n\n # self.schema = self.compute_table_schema()\n\n def resetConn(self, timeout=-1):\n conn = psycopg2.connect(database=self.args.dbname,\n user=self.args.user,\n password=self.args.password,\n host=self.args.host,\n port=self.args.port)\n return conn\n\n def execute_sqls(self,sql):\n self.conn =self.resetConn(timeout=-1)\n cur = self.conn.cursor()\n cur.execute(sql)\n self.conn.commit()\n cur.close()\n self.conn.close()\n\n def execute_sql_duration(self, duration, sql, max_id=0, commit_interval=500):\n self.conn = self.resetConn(timeout=-1)\n cursor = self.conn.cursor()\n start = time.time()\n cnt = 0\n if duration > 0:\n while (time.time() - start) < duration:\n if max_id > 0:\n id = random.randint(1, max_id - 1)\n cursor.execute(sql + str(id) + ';')\n else:\n cursor.execute(sql)\n cnt += 1\n if cnt % commit_interval == 0:\n self.conn.commit()\n else:\n print(\"error, the duration should be larger than 0\")\n self.conn.commit()\n cursor.close()\n self.conn.close()\n return cnt\n\n def concurrent_execute_sql(self, threads, duration, sql, max_id=0, commit_interval=500):\n pool = ThreadPool(threads)\n results = [pool.apply_async(self.execute_sql_duration, (duration, sql, max_id, commit_interval)) for _ in range(threads)]\n pool.close()\n pool.join()\n return results\n \ndef init():\n #add the config\n config_path = \"/root/DB-GPT/config/tool_config.yaml\"\n with open(config_path, 'r') as config_file:\n config = yaml.safe_load(config_file) \n db_args =DBArgs('pgsql', config)\n return db_args\n\n#create a table\ndef create_table(table_name,colsize, ncolumns):\n db=Database(init())\n column_definitions = ', '.join(f'name{i} varchar({colsize})' for i in 
range(ncolumns))\n creat_sql = f'CREATE TABLE {table_name} (id int, {column_definitions}, time timestamp);'\n db.execute_sqls(creat_sql)\n\n#delete the table\ndef delete_table(table_name):\n db=Database(init())\n delete_sql=f'DROP TABLE if exists {table_name}'\n db.execute_sqls(delete_sql)\n\n#print the current time\ndef print_time():\n current_time = datetime.datetime.now()\n formatted_time = current_time.strftime(\"%Y-%m-%d %H:%M:%S\")\n print(formatted_time)\n\n\ndef insert_large_data(threads,duration,ncolumns,nrows,colsize,table_name='table1'):\n\n print_time()\n #Delete undeleted tables\n delete_table(table_name)\n #create a new table\n create_table(table_name,colsize, ncolumns)\n db=Database(init())\n #insert the data\n #insert_definitions = ', '.join(f'repeat(round(random()*999)::text,{(colsize//3)})' for i in range(ncolumns))\n insert_definitions = ', '.join(f'(SELECT substr(md5(random()::text), 1, {colsize}))' for i in range(ncolumns))\n insert_data=f'insert into {table_name} select generate_series(1,{nrows}),{insert_definitions}, now();'\n db.concurrent_execute_sql(threads,duration,insert_data,commit_interval=1)\n\n #delete the table\n delete_table(table_name)\n \n #print the end time\n print_time()\nif __name__ == \"__main__\":\n # Number of threads to use for concurrent inserts\n num_threads = 182\n \n # Duration for which to run the inserts (in seconds)\n insert_duration = None\n \n # Number of columns in the table\n num_columns = 30\n \n # Number of rows to insert\n num_rows = 68\n \n # Size of each column (in characters)\n column_size = 94\n \n # Table name\n table_name = 'table1'\n \n # Call the insert_large_data function\n insert_large_data(num_threads, insert_duration, num_columns, num_rows, column_size, table_name)\n", + "description": "In a data analysis scenario, there are 182 data analysts working simultaneously, trying to insert a large amount of data into a database table. The table has 30 columns, with each column size being 94 characters, and there are 68 rows of data. This scenario simulates the database exception that can occur when multiple users try to insert a large amount of data into the database at the same time.\n", + "desc": "In an Internet of Things (IoT) scenario, where sensor data is collected and analyzed, there is a database called 'SensorDataDB'. This database is specifically designed to handle a large volume of data from various types of sensors. One of the key tables in this database is 'SensorReadings', which records the readings from these sensors. The table has 68 rows of data, with each row representing a reading and containing 30 columns, each with up to 94 characters of information. These columns may include sensor ID, reading type, reading value, timestamp, location, and status. In this scenario, there are 182 sensors that start transmitting data simultaneously at a high frequency. Due to the large amount of concurrent data being inserted into the 'SensorReadings' table, as well as potential issues such as insufficient buffering or indexing, the database may experience performance issues. The insertion of large amounts of data in a short period of time can lead to increased write latency and potential database locking, which can result in anomalies. 
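For loads like this, sending many rows per statement is usually cheaper than issuing one INSERT per row, and psycopg2 ships a helper for exactly that. The sketch below is illustrative: the 'SensorReadings' table and its columns are assumptions from the scenario, and the connection values are placeholders; COPY would be the next step up for even larger volumes.

import psycopg2
from psycopg2.extras import execute_values

conn = psycopg2.connect(database='xxxx', user='xxxx', password='xxxx',
                        host='xxxx', port=5432)
cur = conn.cursor()

rows = [(i, 'temperature', 21.5 + i * 0.01) for i in range(10000)]

# One multi-row INSERT per page_size rows instead of 10,000 single-row INSERTs.
execute_values(
    cur,
    "INSERT INTO SensorReadings (sensor_id, reading_type, reading_value) VALUES %s",
    rows,
    page_size=1000,
)
conn.commit()

cur.close()
conn.close()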
These anomalies can impact the efficiency and accuracy of the sensor data analysis, as well as the overall functionality of the IoT system.\n" + }, + "322": { + "start_time": "1697343846", + "end_time": "1697343906", + "start_timestamp": "2023-10-15 12:24:06", + "end_timestamp": "2023-10-15 12:25:06", + "alerts": [], + "labels": [ + "highly concurrent updates" + ], + "command": "python anomaly_trigger/main.py --anomaly LOCK_CONTENTION", + "script": "import psycopg2\nimport sys\nsys.path.append('/root/DB-GPT/')\nimport time\nimport datetime\nimport random\nimport yaml\nfrom multiprocessing.pool import *\n\n\nclass DBArgs(object):\n\n def __init__(self, dbtype, config, dbname=None):\n self.dbtype = dbtype\n if self.dbtype == 'mysql':\n self.host = config['host']\n self.port = config['port']\n self.user = config['user']\n self.password = config['password']\n self.dbname = dbname if dbname else config['dbname']\n self.driver = 'com.mysql.jdbc.Driver'\n self.jdbc = 'jdbc:mysql://'\n else:\n self.host = config['host']\n self.port = config['port']\n self.user = config['user']\n self.password = config['password']\n self.dbname = dbname if dbname else config['dbname']\n self.driver = 'org.postgresql.Driver'\n self.jdbc = 'jdbc:postgresql://'\n\nclass Database():\n def __init__(self, args, timeout=-1):\n self.args = args\n self.conn = self.resetConn(timeout)\n\n\n # self.schema = self.compute_table_schema()\n\n def resetConn(self, timeout=-1):\n conn = psycopg2.connect(database=self.args.dbname,\n user=self.args.user,\n password=self.args.password,\n host=self.args.host,\n port=self.args.port)\n return conn\n \n def execute_sqls(self,sql):\n self.conn =self.resetConn(timeout=-1)\n cur = self.conn.cursor()\n cur.execute(sql)\n self.conn.commit()\n cur.close()\n self.conn.close()\n\n def execute_sql_duration(self, duration, sql, max_id=0, commit_interval=500):\n self.conn = self.resetConn(timeout=-1)\n cursor = self.conn.cursor()\n start = time.time()\n cnt = 0\n if duration > 0:\n while (time.time() - start) < duration:\n if max_id > 0:\n id = random.randint(1, max_id - 1)\n cursor.execute(sql + str(id) + ';')\n else:\n cursor.execute(sql)\n cnt += 1\n if cnt % commit_interval == 0:\n self.conn.commit()\n else:\n print(\"error, the duration should be larger than 0\")\n self.conn.commit()\n cursor.close()\n self.conn.close()\n return cnt\n\n def concurrent_execute_sql(self, threads, duration, sql, max_id=0, commit_interval=500):\n pool = ThreadPool(threads)\n results = [pool.apply_async(self.execute_sql_duration, (duration, sql, max_id, commit_interval)) for _ in range(threads)]\n pool.close()\n pool.join()\n return results\n\ndef init():\n #add the config\n config_path = \"/root/DB-GPT/config/tool_config.yaml\"\n with open(config_path, 'r') as config_file:\n config = yaml.safe_load(config_file) \n db_args =DBArgs('pgsql', config)\n return db_args\n\n\n#create a table\ndef create_table(table_name,colsize, ncolumns):\n db=Database(init())\n column_definitions = ', '.join(f'name{i} varchar({colsize})' for i in range(ncolumns))\n creat_sql = f'CREATE TABLE {table_name} (id int, {column_definitions}, time timestamp);'\n db.execute_sqls(creat_sql)\n\n#delete the table\ndef delete_table(table_name):\n db=Database(init())\n delete_sql=f'DROP TABLE if exists {table_name}'\n db.execute_sqls(delete_sql)\n\n#print the current time\ndef print_time():\n current_time = datetime.datetime.now()\n formatted_time = current_time.strftime(\"%Y-%m-%d %H:%M:%S\")\n print(formatted_time)\n\n\ndef 
lock_contention(threads,duration,ncolumns,nrows,colsize,table_name='table1'):\n #create a new table\n print_time()\n delete_table(table_name)\n create_table(table_name,colsize, ncolumns)\n db=Database(init())\n # insert some data to be updated \n insert_definitions = ', '.join(f'(SELECT substr(md5(random()::text), 1, {colsize}))' for i in range(ncolumns))\n insert_data=f'insert into {table_name} select generate_series(1,{nrows}),{insert_definitions}, now();' \n db.execute_sqls(insert_data) \n pool = Pool(threads)\n for _ in range(threads):\n pool.apply_async(\n lock, (table_name, ncolumns, colsize, duration, nrows))\n pool.close()\n pool.join()\n #delete the table\n delete_table(table_name)\n print_time()\n\ndef lock(table_name, ncolumns, colsize, duration, nrows):\n args=init()\n start = time.time()\n #lock_contention\n while time.time()-start < duration:\n conn = psycopg2.connect(database=args.dbname, user=args.user, password=args.password,\n host=args.host, port=args.port)\n cur = conn.cursor()\n while time.time()-start < duration:\n col_name = random.randint(0, ncolumns-1)\n row_name = random.randint(1, nrows-1)\n lock_contention = f'update {table_name} set name{col_name}=(SELECT substr(md5(random()::text), 1, {colsize})) where id ={row_name}'\n #db.concurrent_execute_sql(threads,duration,lock_contention,nrows)\n cur.execute(lock_contention)\n conn.commit()\n conn.commit()\n conn.close()\n\nif __name__ == \"__main__\":\n # Number of threads to use for concurrent inserts\n num_threads = 134\n \n # Duration for which to run the inserts (in seconds)\n insert_duration = None\n \n # Number of columns in the table\n num_columns = 66\n \n # Number of rows to insert\n num_rows = 204\n \n # Size of each column (in characters)\n column_size = 61\n \n # Table name\n table_name = 'table1'\n \n # Call the insert_large_data function\n lock_contention(num_threads, insert_duration, num_columns, num_rows, column_size, table_name)\n", + "description": "In an online database for a blog platform, 134 users are simultaneously attempting to perform frequent update operations on a database table that contains 66 columns and 204 rows of blog records. Each blog record has a column size of 61 characters. These users are competing with each other to lock the database table and perform the update operations. The duration of this competition is not specified in the command. This scenario simulates a database exception caused by contention for locking the table during the update operations.\n", + "desc": "In an Internet of Things (IoT) scenario, there is a database called 'SensorDataDB' that is used to collect and analyze data from various sensors. This database is designed to handle a large volume of sensor data. One of the key tables in this database is 'SensorReadings', which records information about the readings from 204 sensors. Each sensor reading includes data such as sensor ID, reading type, reading value, timestamp, sensor location, and status information. In this particular scenario, 134 users are simultaneously trying to access and update the 'SensorReadings' table. These users might be IoT devices, automated systems, or even humans interacting with the system. Due to the high number of concurrent access requests, there might be contention for database locks, resulting in delays or failures in processing these requests. 
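Which sessions are waiting on which can be read live from pg_stat_activity while the updates run. A minimal sketch, assuming PostgreSQL 9.6 or newer (for pg_blocking_pids) and the same placeholder connection values:

import psycopg2

conn = psycopg2.connect(database='xxxx', user='xxxx', password='xxxx',
                        host='xxxx', port=5432)
cur = conn.cursor()

# List every backend currently blocked on a lock, with the PIDs blocking it.
cur.execute("""
    SELECT pid,
           pg_blocking_pids(pid) AS blocked_by,
           wait_event_type,
           state,
           left(query, 80) AS query
    FROM pg_stat_activity
    WHERE cardinality(pg_blocking_pids(pid)) > 0;
""")
for pid, blocked_by, wait_type, state, query in cur.fetchall():
    print(f"pid {pid} blocked by {blocked_by} ({wait_type}, {state}): {query}")

cur.close()
conn.close()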
This contention can lead to anomalies in the database and might affect the real-time monitoring capabilities or analysis of the collected sensor data.\n" + }, + "323": { + "start_time": "1697343966", + "end_time": "1697344004", + "start_timestamp": "2023-10-15 12:26:06", + "end_timestamp": "2023-10-15 12:26:44", + "alerts": [], + "labels": [ + "highly deletes" + ], + "command": "python anomaly_trigger/main.py --anomaly VACUUM", + "script": "import psycopg2\nimport sys\nsys.path.append('/root/DB-GPT/')\nimport time\nimport datetime\nimport random\nimport yaml\nfrom multiprocessing.pool import *\n\n\nclass DBArgs(object):\n\n def __init__(self, dbtype, config, dbname=None):\n self.dbtype = dbtype\n if self.dbtype == 'mysql':\n self.host = config['host']\n self.port = config['port']\n self.user = config['user']\n self.password = config['password']\n self.dbname = dbname if dbname else config['dbname']\n self.driver = 'com.mysql.jdbc.Driver'\n self.jdbc = 'jdbc:mysql://'\n else:\n self.host = config['host']\n self.port = config['port']\n self.user = config['user']\n self.password = config['password']\n self.dbname = dbname if dbname else config['dbname']\n self.driver = 'org.postgresql.Driver'\n self.jdbc = 'jdbc:postgresql://'\n\nclass Database():\n def __init__(self, args, timeout=-1):\n self.args = args\n self.conn = self.resetConn(timeout)\n\n\n # self.schema = self.compute_table_schema()\n\n def resetConn(self, timeout=-1):\n conn = psycopg2.connect(database=self.args.dbname,\n user=self.args.user,\n password=self.args.password,\n host=self.args.host,\n port=self.args.port)\n return conn\n \n def execute_sqls(self,sql):\n self.conn =self.resetConn(timeout=-1)\n cur = self.conn.cursor()\n cur.execute(sql)\n self.conn.commit()\n cur.close()\n self.conn.close()\n\n def execute_sql_duration(self, duration, sql, max_id=0, commit_interval=500):\n self.conn = self.resetConn(timeout=-1)\n cursor = self.conn.cursor()\n start = time.time()\n cnt = 0\n if duration > 0:\n while (time.time() - start) < duration:\n if max_id > 0:\n id = random.randint(1, max_id - 1)\n cursor.execute(sql + str(id) + ';')\n else:\n cursor.execute(sql)\n cnt += 1\n if cnt % commit_interval == 0:\n self.conn.commit()\n else:\n print(\"error, the duration should be larger than 0\")\n self.conn.commit()\n cursor.close()\n self.conn.close()\n return cnt\n\n def concurrent_execute_sql(self, threads, duration, sql, max_id=0, commit_interval=500):\n pool = ThreadPool(threads)\n results = [pool.apply_async(self.execute_sql_duration, (duration, sql, max_id, commit_interval)) for _ in range(threads)]\n pool.close()\n pool.join()\n return results\n \ndef init():\n #add the config\n config_path = \"/root/DB-GPT/config/tool_config.yaml\"\n with open(config_path, 'r') as config_file:\n config = yaml.safe_load(config_file) \n db_args =DBArgs('pgsql', config)\n return db_args\n\n\n#create a table\ndef create_table(table_name,colsize, ncolumns):\n db=Database(init())\n column_definitions = ', '.join(f'name{i} varchar({colsize})' for i in range(ncolumns))\n creat_sql = f'CREATE TABLE {table_name} (id int, {column_definitions}, time timestamp);'\n db.execute_sqls(creat_sql)\n\n#delete the table\ndef delete_table(table_name):\n db=Database(init())\n delete_sql=f'DROP TABLE if exists {table_name}'\n db.execute_sqls(delete_sql)\n\n#print the current time\ndef print_time():\n current_time = datetime.datetime.now()\n formatted_time = current_time.strftime(\"%Y-%m-%d %H:%M:%S\")\n print(formatted_time)\n\n\ndef 
vacuum(threads,duration,ncolumns,nrows,colsize,table_name='table1'):\n db=Database(init())\n #create a new table\n print_time()\n delete_table(table_name)\n create_table(table_name,colsize, ncolumns)\n\n # insert some data to be deleted\n insert_definitions = ', '.join(f'(SELECT substr(md5(random()::text), 1, {colsize}))' for i in range(ncolumns))\n insert_data=f'insert into {table_name} select generate_series(1,{nrows}),{insert_definitions}, now();' \n db.execute_sqls(insert_data) \n\n # delete 80% of the rows\n delete_nrows=int(nrows*0.8)\n vacuum=f'delete from {table_name} where id < {delete_nrows};'\n db.execute_sqls(vacuum)\n\n # do the select , then the vacuum occurs\n select='select * from '+table_name+' where id='\n db.concurrent_execute_sql(threads,duration,select,nrows)\n\n #delete the table\n delete_table(table_name)\n print_time()\n\n\nif __name__ == \"__main__\":\n # Number of threads to use for concurrent inserts\n num_threads = 180\n \n # Duration for which to run the inserts (in seconds)\n insert_duration = None\n \n # Number of columns in the table\n num_columns = 11\n \n # Number of rows to insert\n num_rows = 3176852\n \n # Size of each column (in characters)\n column_size = 94\n \n # Table name\n table_name = 'table1'\n \n # Call the insert_large_data function\n vacuum(num_threads, insert_duration, num_columns, num_rows, column_size, table_name)\n", + "description": "In the database of a website that sells various products, 180 users simultaneously perform a search operation after a large-scale data cleaning operation on a database table containing 11 columns, 3,176,852 rows. Each column has a size of 94 characters.\n", + "desc": "In the e-commerce database of an online store, named 'OnlineStoreDB', there is a table called 'ProductInformation' that stores detailed information about various products. This table contains 3,176,852 rows of data, with each row representing a specific product. Additionally, there are 11 columns in this table, each capable of storing information up to 94 characters long. These columns might include product ID, name, price, stock quantity, description, brand, category, size, color, weight, and production date. During a specific period, 180 users conduct simultaneous searches for products using the online store's search interface. However, due to the high volume of search queries and the lack of necessary indexes on commonly used search columns, such as name, brand, and category, the database becomes inefficient in handling these concurrent search requests. 
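Whether those concurrent lookups are running as sequential scans over a table that is mostly dead rows can be checked from pg_stat_user_tables. A short sketch with the usual placeholder connection values, targeting the generated 'table1':

import psycopg2

conn = psycopg2.connect(database='xxxx', user='xxxx', password='xxxx',
                        host='xxxx', port=5432)
cur = conn.cursor()

cur.execute("""
    SELECT seq_scan, idx_scan, n_live_tup, n_dead_tup
    FROM pg_stat_user_tables
    WHERE relname = 'table1';
""")
row = cur.fetchone()
if row is not None:
    seq_scan, idx_scan, live, dead = row
    # A high seq_scan count plus a large dead-tuple share means each lookup
    # keeps rereading space the mass DELETE left behind until vacuum runs.
    print(f"seq_scan={seq_scan}, idx_scan={idx_scan}, live={live}, dead={dead}")

cur.close()
conn.close()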
This inefficiency can result in anomalies within the database, leading to a decline in user experience and negatively affecting the overall operational efficiency of the online store.\n" + }, + "324": { + "start_time": "1697344064", + "end_time": "1697344178", + "start_timestamp": "2023-10-15 12:27:44", + "end_timestamp": "2023-10-15 12:29:38", + "alerts": [], + "labels": [ + "too many indexes" + ], + "command": "python anomaly_trigger/main.py --anomaly REDUNDANT_INDEX", + "script": "import psycopg2\nimport sys\nsys.path.append('/root/DB-GPT/')\nimport time\nimport datetime\nimport random\nimport yaml\nfrom multiprocessing.pool import *\n\n\nclass DBArgs(object):\n\n def __init__(self, dbtype, config, dbname=None):\n self.dbtype = dbtype\n if self.dbtype == 'mysql':\n self.host = config['host']\n self.port = config['port']\n self.user = config['user']\n self.password = config['password']\n self.dbname = dbname if dbname else config['dbname']\n self.driver = 'com.mysql.jdbc.Driver'\n self.jdbc = 'jdbc:mysql://'\n else:\n self.host = config['host']\n self.port = config['port']\n self.user = config['user']\n self.password = config['password']\n self.dbname = dbname if dbname else config['dbname']\n self.driver = 'org.postgresql.Driver'\n self.jdbc = 'jdbc:postgresql://'\n\nclass Database():\n def __init__(self, args, timeout=-1):\n self.args = args\n self.conn = self.resetConn(timeout)\n\n\n # self.schema = self.compute_table_schema()\n\n def resetConn(self, timeout=-1):\n conn = psycopg2.connect(database=self.args.dbname,\n user=self.args.user,\n password=self.args.password,\n host=self.args.host,\n port=self.args.port)\n return conn\n \n def execute_sqls(self,sql):\n self.conn =self.resetConn(timeout=-1)\n cur = self.conn.cursor()\n cur.execute(sql)\n self.conn.commit()\n cur.close()\n self.conn.close()\n\n def execute_sql_duration(self, duration, sql, max_id=0, commit_interval=500):\n self.conn = self.resetConn(timeout=-1)\n cursor = self.conn.cursor()\n start = time.time()\n cnt = 0\n if duration > 0:\n while (time.time() - start) < duration:\n if max_id > 0:\n id = random.randint(1, max_id - 1)\n cursor.execute(sql + str(id) + ';')\n else:\n cursor.execute(sql)\n cnt += 1\n if cnt % commit_interval == 0:\n self.conn.commit()\n else:\n print(\"error, the duration should be larger than 0\")\n self.conn.commit()\n cursor.close()\n self.conn.close()\n return cnt\n\n def concurrent_execute_sql(self, threads, duration, sql, max_id=0, commit_interval=500):\n pool = ThreadPool(threads)\n results = [pool.apply_async(self.execute_sql_duration, (duration, sql, max_id, commit_interval)) for _ in range(threads)]\n pool.close()\n pool.join()\n return results\n \n def build_index(self, table_name, idx_num):\n self.conn = self.resetConn(timeout=-1)\n cursor = self.conn.cursor()\n \n for i in range(0, idx_num):\n the_sql = 'CREATE INDEX index_' + table_name + '_' + str(i) + ' ON ' + table_name + '(name' + str(i) + ');'\n print(the_sql)\n cursor.execute(the_sql)\n\n \n self.conn.commit()\n self.conn.close()\n return\n\n\n \n def drop_index(self,table_name):\n self.conn = self.resetConn(timeout=-1)\n cursor = self.conn.cursor()\n cursor.execute(\"select indexname from pg_indexes where tablename='\"+table_name+\"';\")\n idxs = cursor.fetchall()\n for idx in idxs:\n the_sql = 'DROP INDEX ' + idx[0] + ';'\n cursor.execute(the_sql)\n print(the_sql)\n self.conn.commit()\n self.conn.close()\n return\n\n\ndef init():\n #add the config\n config_path = \"/root/DB-GPT/config/tool_config.yaml\"\n with 
open(config_path, 'r') as config_file:\n config = yaml.safe_load(config_file) \n db_args =DBArgs('pgsql', config)\n return db_args\n\n\n#create a table\ndef create_table(table_name,colsize, ncolumns):\n db=Database(init())\n column_definitions = ', '.join(f'name{i} varchar({colsize})' for i in range(ncolumns))\n creat_sql = f'CREATE TABLE {table_name} (id int, {column_definitions}, time timestamp);'\n db.execute_sqls(creat_sql)\n\n#delete the table\ndef delete_table(table_name):\n db=Database(init())\n delete_sql=f'DROP TABLE if exists {table_name}'\n db.execute_sqls(delete_sql)\n\n#print the current time\ndef print_time():\n current_time = datetime.datetime.now()\n formatted_time = current_time.strftime(\"%Y-%m-%d %H:%M:%S\")\n print(formatted_time)\n\ndef redundent_index(threads,duration,ncolumns,nrows,colsize,nindex,table_name='table1'):\n #create a new table\n print_time()\n delete_table(table_name)\n create_table(table_name,colsize, ncolumns)\n db=Database(init())\n # insert some data to be updated \n insert_definitions = ', '.join(f'(SELECT substr(md5(random()::text), 1, {colsize}))' for i in range(ncolumns))\n insert_data=f'insert into {table_name} select generate_series(1,{nrows}),{insert_definitions}, now();' \n db.execute_sqls(insert_data) \n\n #initialization of the indexes\n nindex=int((nindex*ncolumns)/10)\n db.build_index(table_name,nindex)\n id_index='CREATE INDEX index_'+table_name+'_id ON '+table_name+'(id);'\n db.execute_sqls(id_index)\n\n #lock_contention\n pool = Pool(threads)\n for _ in range(threads):\n pool.apply_async(\n lock, (table_name, ncolumns, colsize, duration, nrows))\n pool.close()\n pool.join()\n\n #drop the index\n db.drop_index(table_name)\n\n #delete the table\n delete_table(table_name)\n print_time()\n\n\ndef lock(table_name, ncolumns, colsize, duration, nrows):\n args=init()\n start = time.time()\n #lock_contention\n while time.time()-start < duration:\n conn = psycopg2.connect(database=args.dbname, user=args.user, password=args.password,\n host=args.host, port=args.port)\n cur = conn.cursor()\n while time.time()-start < duration:\n col_name = random.randint(0, ncolumns-1)\n row_name = random.randint(1, nrows-1)\n lock_contention = f'update {table_name} set name{col_name}=(SELECT substr(md5(random()::text), 1, {colsize})) where id ={row_name}'\n #db.concurrent_execute_sql(threads,duration,lock_contention,nrows)\n cur.execute(lock_contention)\n conn.commit()\n conn.commit()\n conn.close()\n\nif __name__ == \"__main__\":\n # Number of threads to use for concurrent inserts\n num_threads = 9\n \n # Duration for which to run the inserts (in seconds)\n insert_duration = None\n \n # Number of columns in the table\n num_columns = 82\n \n # Number of rows to insert\n num_rows = 705049\n \n # Size of each column (in characters)\n column_size = 85\n \n # Table name\n table_name = 'table1'\n \n nindex=6\n \n # Call the insert_large_data function\n redundent_index(num_threads, insert_duration, num_columns, num_rows, column_size, nindex,table_name)\n", + "description": "In an online store's database with 82 columns and 705,049 rows of product records, each with a column size of 85 characters, a large number of redundant indexes are created for various product attributes. This can lead to additional storage space usage and performance overhead.\n", + "desc": "In an Internet of Things (IoT) scenario, suppose there is a database named 'IoTDataDB', which is designed to store and process large volumes of data generated by various IoT devices. 
This database contains a key table named 'SensorData', which is responsible for storing sensor data from different devices. The 'SensorData' table consists of 705,049 rows of data, with each row representing a data record from a sensor. The table has 82 columns, each capable of storing information up to 85 characters. These columns may include sensor ID, temperature, humidity, pressure, light intensity, motion detection, location, timestamp, and more. Due to the nature of IoT devices and their continuous data generation, the 'SensorData' table requires efficient indexing to handle queries quickly. However, in some cases, the database administrator might create redundant indexes, meaning indexes that provide duplicate or unnecessary information for query acceleration. These redundant indexes, if not managed properly, can result in additional storage usage and increased maintenance overhead. Furthermore, redundant indexes may impact the performance of queries, as the database has to process unnecessary index updates during write operations. In the IoT environment, where data ingestion and processing rates are high, these redundant indexes could lead to slower query response times and database performance degradation.\n" + }, + "325": { + "start_time": "1697344238", + "end_time": "1697344329", + "start_timestamp": "2023-10-15 12:30:38", + "end_timestamp": "2023-10-15 12:32:09", + "alerts": [], + "labels": [ + "INSERT_LARGE_DATA", + "IO_CONTENTION" + ], + "command": "python anomaly_trigger/main.py --anomaly INSERT_LARGE_DATA,IO_CONTENTION", + "script": "import os\nimport datetime\n\n#print the current time\ndef print_time():\n current_time = datetime.datetime.now()\n formatted_time = current_time.strftime(\"%Y-%m-%d %H:%M:%S\")\n print(formatted_time)\n\nif __name__ == \"__main__\":\n print_time()\n command = (\n \"su - root -c 'cd /sysbench-tpcc-master; \"\n \"./tpcc.lua --db-driver=pgsql --tables=2 --scale=3 --threads=50 --events=0 \"\n \"--pgsql-host=xxxx --pgsql-user=xxxx --pgsql-password=xxxx \"\n \"--pgsql-port=5432 --pgsql-db=tpcc --time=90 --rand-type=uniform --report-interval=10 run'\"\n )\n\n os.system(command)\n print_time()\n", + "description": "In a file sharing system where multiple users are uploading, downloading, or editing files simultaneously, the system is experiencing contention for input/output (I/O) resources. This contention slows down the file transfer process.\n", + "desc": "In a file sharing system scenario, such as a cloud storage service used by teams or organizations, there is a database named 'TeamFileShareDB', which is responsible for storing and managing the files shared among team members. This database not only stores the actual files but also keeps track of metadata such as file size, upload and modification dates, access permissions, and download counts. On a typical day, multiple users might be uploading, downloading, or editing files simultaneously. For example, a project team might be collaborating on a document, with team members constantly uploading new versions of the file while others download or edit it. Additionally, the system might be used for storing and sharing large files like presentations, video recordings, or design drawings.Due to the high concurrency of file operations, the 'TeamFileShareDB' database can face challenges related to I/O contention. When multiple users upload or download large files at the same time, the system's storage and network bandwidth can become strained. 
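How much of that pressure actually reaches the disks can be estimated from pg_stat_database: a falling buffer-cache hit ratio and growing temp_bytes during the busy window mean reads and sorts are spilling to storage and competing for I/O bandwidth. A small sketch, again with placeholder connection values:

import psycopg2

conn = psycopg2.connect(database='xxxx', user='xxxx', password='xxxx',
                        host='xxxx', port=5432)
cur = conn.cursor()

cur.execute("""
    SELECT datname,
           blks_read,
           blks_hit,
           round(blks_hit * 100.0 / NULLIF(blks_hit + blks_read, 0), 2) AS hit_ratio,
           temp_files,
           temp_bytes
    FROM pg_stat_database
    WHERE datname = current_database();
""")
print(cur.fetchone())

cur.close()
conn.close()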
This contention can lead to slower file transfer speeds, especially if there is limited bandwidth or the server processing capabilities are insufficient. Furthermore, the frequent write operations in the database, such as file uploads and metadata updates, can impact the overall performance of the database. During peak periods with heavy user activity, the database might experience locking and transaction management issues, further slowing down file processing and metadata recording.\n" + }, + "326": { + "start_time": "1697344389", + "end_time": "1697344449", + "start_timestamp": "2023-10-15 12:33:09", + "end_timestamp": "2023-10-15 12:34:09", + "alerts": [], + "labels": [ + "POOR JOIN PERFORMANCE", + "CPU CONTENTION" + ], + "command": "python anomaly_trigger/main.py --anomaly POOR_JOIN_PERFORMANCE,CPU_CONTENTION", + "script": "import os\nimport re\nimport time\n\nimport psycopg2\n\n\nREPEATCOUNT = 1\nTIMELOGPATH = str(int(time.time())) + \"_job_trigger_time_log.txt\"\nTIMELOG = open(TIMELOGPATH, 'w+')\n\n\nclass Database():\n\n def __init__(self):\n self.conn = None\n self.conn = psycopg2.connect(database='imdbload',\n user='xxxx',\n password='xxxx',\n host='xxxx',\n port=5432)\n\n def execute_sql(self, sql):\n fail = 1\n cur = self.conn.cursor()\n i = 0\n cnt = 3\n while fail == 1 and i < cnt:\n try:\n fail = 0\n cur.execute(sql)\n except BaseException as error:\n fail = 1\n print(error)\n res = []\n if fail == 0:\n res = cur.fetchall()\n i = i + 1\n if fail == 1:\n # print(\"SQL Execution Fatal!!\", sql)\n return 0, ''\n elif fail == 0:\n return 1, res\n\n\ndef all_sql_files():\n res_path = \"{}/join-order-benchmark-master/\".format(\n os.path.dirname(os.path.abspath(__file__)))\n # all_file_list = list(filter(file_filter, os.listdir(res_path)))\n # all_file_list = sorted(all_file_list, key=custom_sort)\n all_file_list = [\n '1a.sql', '1b.sql', '1c.sql', '1d.sql',\n '2a.sql', '2b.sql', '2c.sql', '2d.sql',\n '3a.sql', '3b.sql', '3c.sql',\n '4a.sql', '4b.sql', '4c.sql',\n '5a.sql', '5b.sql', '5c.sql',\n '6a.sql', '6b.sql', '6c.sql', '6d.sql', '6e.sql', '6f.sql',\n '7a.sql', '7b.sql', '7c.sql',\n '8a.sql', '8b.sql', '8c.sql', '8d.sql',\n '9a.sql', '9b.sql', '9c.sql', '9d.sql',\n '10a.sql', '10b.sql', '10c.sql',\n '11a.sql', '11b.sql', '11c.sql', '11d.sql',\n '12a.sql', '12b.sql', '12c.sql',\n '13a.sql', '13b.sql', '13c.sql', '13d.sql',\n '14a.sql', '14b.sql', '14c.sql',\n '15a.sql', '15b.sql', '15c.sql', '15d.sql',\n '16a.sql', '16b.sql', '16c.sql', '16d.sql',\n '17a.sql', '17b.sql', '17c.sql', '17d.sql', '17e.sql', '17f.sql',\n '18a.sql', '18b.sql', '18c.sql',\n '19a.sql', '19b.sql', '19c.sql', '19d.sql',\n '20a.sql', '20b.sql', '20c.sql',\n '21a.sql', '21b.sql', '21c.sql',\n '22a.sql', '22b.sql', '22c.sql', '22d.sql',\n '23a.sql', '23b.sql', '23c.sql',\n '24a.sql', '24b.sql',\n '25a.sql', '25b.sql', '25c.sql',\n '26a.sql', '26b.sql', '26c.sql',\n '27a.sql', '27b.sql', '27c.sql',\n '28a.sql', '28b.sql', '28c.sql',\n '29a.sql', '29b.sql', '29c.sql',\n '30a.sql', '30b.sql', '30c.sql',\n '31a.sql', '31b.sql', '31c.sql',\n '32a.sql', '32b.sql',\n '33a.sql', '33b.sql', '33c.sql']\n\n print(all_file_list)\n files_list = []\n for file in all_file_list:\n files_list.append(res_path + file)\n return files_list\n\n\ndef custom_sort(item):\n # \u63d0\u53d6\u6570\u5b57\u548c\u5b57\u6bcd\u90e8\u5206\n match = re.match(r'(\\d+)(\\D+)', item)\n # \u5c06\u6570\u5b57\u90e8\u5206\u8f6c\u6362\u4e3a\u6574\u6570\u4ee5\u8fdb\u884c\u6bd4\u8f83\n num_part = int(match.group(1))\n # 
\u8fd4\u56de\u5143\u7ec4\u4ee5\u6309\u6570\u5b57\u548c\u5b57\u6bcd\u6392\u5e8f\n return (num_part, match.group(2))\n\n\ndef file_filter(f):\n if f[-4:] == '.sql' and 'schema' not in f and 'fkindexes' not in f:\n return True\n else:\n return False\n\n\ndef get_sql_from_file(file_name):\n file = open(file_name)\n lines = file.readlines().copy()\n sql = ''\n for line in lines:\n sql += line\n sql = sql.replace('\n', ' ').replace(' ', ' ').replace(' ', ' ')\n file.close()\n return sql\n\n\ndef test_hint_from_file(sql_file):\n db = Database()\n sql = get_sql_from_file(sql_file)\n success, result_cont = db.execute_sql(sql)\n print(success, result_cont)\n\n\ndef test_all():\n sql_files = all_sql_files()\n\n for sql_file in list(sql_files)[:-10]:\n if sql_file:\n test_hint_from_file(sql_file)\n\n\ndef test_one():\n res_path = \"{}/join-order-benchmark-master/\".format(\n os.path.dirname(os.path.abspath(__file__)))\n test_hint_from_file(res_path + '1a.sql')\n\n\nif __name__ == '__main__':\n for i in range(0, REPEATCOUNT):\n TIMELOG.write(str(int(time.time()))+\";\")\n test_all()\n TIMELOG.write(str(int(time.time()))+\"\n\")\n TIMELOG.flush()\n\n TIMELOG.close()\n\n)\n", + "description": "In a data analytics platform, several users are simultaneously executing join operations on large datasets. Due to inefficient join algorithms and limited CPU resources, the system experiences poor performance and high CPU contention.\n", + "desc": "In a Business Intelligence (BI) scenario, there is a database named 'CorporateAnalyticsDB' that stores and analyzes various business data for a large corporation. This database consists of multiple complex tables, such as 'SalesData', 'CustomerProfiles', 'ProductCatalog', and 'MarketTrends', which contain a large amount of detailed business information. Analysts in the company frequently need to perform complex join queries across these tables to generate comprehensive business reports. For example, they might need to join the 'SalesData' table, which contains sales records, with the 'CustomerProfiles' table, which contains customer information, to analyze the purchasing behaviors of different customer groups. They might also need to link this data with the 'ProductCatalog' table and 'MarketTrends' table to gain deeper market insights. However, due to the large size of these tables and the involved multi-table joins, these queries can become very slow. If the join keys in these tables are not effectively indexed or if the queries are not properly optimized, executing these join operations can consume a significant amount of time and resources. During peak periods, when multiple complex join queries are executed simultaneously, there could be competition for CPU resources, leading to a decrease in query efficiency. 
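Before attributing a slow report to CPU contention, the join plan itself can be inspected. A minimal sketch follows; the database, table, and column names ('CorporateAnalyticsDB', 'SalesData', 'CustomerProfiles', 'customer_id') are taken from the scenario description or assumed rather than a real schema, and note that EXPLAIN ANALYZE executes the query it explains.

import psycopg2

# Placeholder connection; identifiers below are assumptions for illustration only.
conn = psycopg2.connect(database="CorporateAnalyticsDB", user="xxxx",
                        password="xxxx", host="xxxx", port=5432)
cur = conn.cursor()
cur.execute(
    "EXPLAIN (ANALYZE, BUFFERS) "
    "SELECT c.customer_id, count(*) "
    "FROM SalesData s JOIN CustomerProfiles c ON s.customer_id = c.customer_id "
    "GROUP BY c.customer_id;"
)
plan = "\n".join(row[0] for row in cur.fetchall())
# A sequential scan over a large join input usually points at a missing index on the
# join key rather than genuine CPU saturation.
print("seq scan on join input:", "Seq Scan" in plan)
print(plan)
cur.close()
conn.close()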
The CPU contention might occur due to too many compute-intensive queries running on the database server or the server's CPU resources being insufficient to handle the load.\n" + }, + "327": { + "start_time": "1697344509", + "end_time": "1697344658", + "start_timestamp": "2023-10-15 12:35:09", + "end_timestamp": "2023-10-15 12:37:38", + "alerts": [], + "labels": [ + "FETCH_LARGE_DATA", + "CORRELATED SUBQUERY" + ], + "command": "python anomaly_trigger/main.py --anomaly FETCH_LARGE_DATA,CORRELATED_SUBQUERY", + "script": "import os\nimport re\nimport time\n\nimport psycopg2\n\n\nREPEATCOUNT = 1\nTIMELOGPATH = str(int(time.time())) + \"_tpch_trigger_time_log.txt\"\nTIMELOG = open(TIMELOGPATH, 'w+')\n\n\nclass Database():\n\n def __init__(self):\n self.conn = None\n self.conn = psycopg2.connect(database='tpch',\n user='xxxx',\n password='xxxx',\n host='xxxx',\n port=5432)\n\n def execute_sql(self, sql):\n fail = 1\n cur = self.conn.cursor()\n i = 0\n cnt = 3\n while fail == 1 and i < cnt:\n try:\n fail = 0\n cur.execute(sql)\n except BaseException as error:\n fail = 1\n print(error)\n res = []\n if fail == 0:\n res = cur.fetchall()\n i = i + 1\n if fail == 1:\n # print(\"SQL Execution Fatal!!\", sql)\n return 0, ''\n elif fail == 0:\n return 1, res\n\n\ndef all_sql_files():\n res_path = \"{}/tpch-queries/\".format(\n os.path.dirname(os.path.abspath(__file__)))\n # all_file_list = list(filter(file_filter, os.listdir(res_path)))\n # all_file_list = sorted(all_file_list, key=custom_sort)\n all_file_list = [\n '4.explain.sql']\n\n print(all_file_list)\n files_list = []\n for file in all_file_list:\n files_list.append(res_path + file)\n return files_list\n\n\ndef custom_sort(item):\n # \u63d0\u53d6\u6570\u5b57\u548c\u5b57\u6bcd\u90e8\u5206\n match = re.match(r'(\\d+)(\\D+)', item)\n # \u5c06\u6570\u5b57\u90e8\u5206\u8f6c\u6362\u4e3a\u6574\u6570\u4ee5\u8fdb\u884c\u6bd4\u8f83\n num_part = int(match.group(1))\n # \u8fd4\u56de\u5143\u7ec4\u4ee5\u6309\u6570\u5b57\u548c\u5b57\u6bcd\u6392\u5e8f\n return (num_part, match.group(2))\n\n\ndef file_filter(f):\n if f[-4:] == '.sql' and 'schema' not in f and 'fkindexes' not in f:\n return True\n else:\n return False\n\n\ndef get_sql_from_file(file_name):\n file = open(file_name)\n lines = file.readlines().copy()\n sql = ''\n for line in lines:\n sql += line\n sql = sql.replace('\n', ' ').replace(' ', ' ').replace(' ', ' ')\n file.close()\n return sql\n\n\ndef test_hint_from_file(sql_file):\n db = Database()\n sql = get_sql_from_file(sql_file)\n success, result_cont = db.execute_sql(sql)\n print(success, result_cont)\n\n\ndef test_all():\n sql_files = all_sql_files()\n\n for sql_file in list(sql_files):\n if sql_file:\n test_hint_from_file(sql_file)\n\n\ndef test_one():\n res_path = \"{}/tpch-queries/\".format(\n os.path.dirname(os.path.abspath(__file__)))\n test_hint_from_file(res_path + '1.explain.sql')\n\n\nif __name__ == '__main__':\n for i in range(0, REPEATCOUNT):\n TIMELOG.write(str(int(time.time()))+\";\")\n test_all()\n TIMELOG.write(str(int(time.time()))+\"\n\")\n TIMELOG.flush()\n\n TIMELOG.close()\n", + "description": "In a real-life scenario, this command is simulating a situation in an e-commerce platform's database where there is a need to retrieve a large amount of data regarding inventory for each product. This retrieval requires the execution of related subqueries. 
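The correlated-subquery pattern described here can usually be rewritten as a plain join, so the inner query is not re-executed once per row. A sketch of the two forms, with assumed table and column names ('ProductDetails', 'ProductInventory', 'product_id', 'stock_level', 'category'):

# All identifiers below are assumptions used only to illustrate the rewrite.

# Correlated form: the inner query re-executes once per product row.
correlated = """
SELECT d.product_id,
       (SELECT i.stock_level
        FROM ProductInventory i
        WHERE i.product_id = d.product_id) AS stock_level
FROM ProductDetails d
WHERE d.category = 'electronics';
"""

# Equivalent join form: one pass over each table, which the planner can hash or merge join.
rewritten = """
SELECT d.product_id, i.stock_level
FROM ProductDetails d
JOIN ProductInventory i ON i.product_id = d.product_id
WHERE d.category = 'electronics';
"""
print(correlated, rewritten)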
However, if these subqueries are not optimized, the performance of the inventory query may suffer, leading to slower execution times.\n", + "desc": "In an e-commerce scenario, there is a database called 'ECommerceDB' that stores information about various products. Within this database, there is a table called 'ProductInventory' that contains inventory data for a large number of products. This data includes the product ID, current stock level, last update time, supplier ID, and warehouse location. In this database, there is a common query that involves performing subqueries to retrieve inventory information for products within a specific category. However, when the number of products is very large, executing these subqueries can be inefficient and time-consuming. This is because each subquery needs to be executed individually for each product, leading to a significant amount of disk IO and potentially causing I/O bottlenecks.\n" + }, + "328": { + "start_time": "1697344719", + "end_time": "1697344791", + "start_timestamp": "2023-10-15 12:38:39", + "end_timestamp": "2023-10-15 12:39:51", + "alerts": [], + "labels": [ + "highly concurrent commits or highly concurrent inserts" + ], + "command": "python anomaly_trigger/main.py --anomaly INSERT_LARGE_DATA", + "script": "import psycopg2\nimport sys\nsys.path.append('/root/DB-GPT/')\nimport time\nimport datetime\nimport random\nimport yaml\nfrom multiprocessing.pool import *\n\nclass DBArgs(object):\n\n def __init__(self, dbtype, config, dbname=None):\n self.dbtype = dbtype\n if self.dbtype == 'mysql':\n self.host = config['host']\n self.port = config['port']\n self.user = config['user']\n self.password = config['password']\n self.dbname = dbname if dbname else config['dbname']\n self.driver = 'com.mysql.jdbc.Driver'\n self.jdbc = 'jdbc:mysql://'\n else:\n self.host = config['host']\n self.port = config['port']\n self.user = config['user']\n self.password = config['password']\n self.dbname = dbname if dbname else config['dbname']\n self.driver = 'org.postgresql.Driver'\n self.jdbc = 'jdbc:postgresql://'\n\nclass Database():\n def __init__(self, args, timeout=-1):\n self.args = args\n self.conn = self.resetConn(timeout)\n\n\n # self.schema = self.compute_table_schema()\n\n def resetConn(self, timeout=-1):\n conn = psycopg2.connect(database=self.args.dbname,\n user=self.args.user,\n password=self.args.password,\n host=self.args.host,\n port=self.args.port)\n return conn\n\n def execute_sqls(self,sql):\n self.conn =self.resetConn(timeout=-1)\n cur = self.conn.cursor()\n cur.execute(sql)\n self.conn.commit()\n cur.close()\n self.conn.close()\n\n def execute_sql_duration(self, duration, sql, max_id=0, commit_interval=500):\n self.conn = self.resetConn(timeout=-1)\n cursor = self.conn.cursor()\n start = time.time()\n cnt = 0\n if duration > 0:\n while (time.time() - start) < duration:\n if max_id > 0:\n id = random.randint(1, max_id - 1)\n cursor.execute(sql + str(id) + ';')\n else:\n cursor.execute(sql)\n cnt += 1\n if cnt % commit_interval == 0:\n self.conn.commit()\n else:\n print(\"error, the duration should be larger than 0\")\n self.conn.commit()\n cursor.close()\n self.conn.close()\n return cnt\n\n def concurrent_execute_sql(self, threads, duration, sql, max_id=0, commit_interval=500):\n pool = ThreadPool(threads)\n results = [pool.apply_async(self.execute_sql_duration, (duration, sql, max_id, commit_interval)) for _ in range(threads)]\n pool.close()\n pool.join()\n return results\n \ndef init():\n #add the config\n config_path = 
\"/root/DB-GPT/config/tool_config.yaml\"\n with open(config_path, 'r') as config_file:\n config = yaml.safe_load(config_file) \n db_args =DBArgs('pgsql', config)\n return db_args\n\n#create a table\ndef create_table(table_name,colsize, ncolumns):\n db=Database(init())\n column_definitions = ', '.join(f'name{i} varchar({colsize})' for i in range(ncolumns))\n creat_sql = f'CREATE TABLE {table_name} (id int, {column_definitions}, time timestamp);'\n db.execute_sqls(creat_sql)\n\n#delete the table\ndef delete_table(table_name):\n db=Database(init())\n delete_sql=f'DROP TABLE if exists {table_name}'\n db.execute_sqls(delete_sql)\n\n#print the current time\ndef print_time():\n current_time = datetime.datetime.now()\n formatted_time = current_time.strftime(\"%Y-%m-%d %H:%M:%S\")\n print(formatted_time)\n\n\ndef insert_large_data(threads,duration,ncolumns,nrows,colsize,table_name='table1'):\n\n print_time()\n #Delete undeleted tables\n delete_table(table_name)\n #create a new table\n create_table(table_name,colsize, ncolumns)\n db=Database(init())\n #insert the data\n #insert_definitions = ', '.join(f'repeat(round(random()*999)::text,{(colsize//3)})' for i in range(ncolumns))\n insert_definitions = ', '.join(f'(SELECT substr(md5(random()::text), 1, {colsize}))' for i in range(ncolumns))\n insert_data=f'insert into {table_name} select generate_series(1,{nrows}),{insert_definitions}, now();'\n db.concurrent_execute_sql(threads,duration,insert_data,commit_interval=1)\n\n #delete the table\n delete_table(table_name)\n \n #print the end time\n print_time()\nif __name__ == \"__main__\":\n # Number of threads to use for concurrent inserts\n num_threads = 158\n \n # Duration for which to run the inserts (in seconds)\n insert_duration = None\n \n # Number of columns in the table\n num_columns = 7\n \n # Number of rows to insert\n num_rows = 51\n \n # Size of each column (in characters)\n column_size = 41\n \n # Table name\n table_name = 'table1'\n \n # Call the insert_large_data function\n insert_large_data(num_threads, insert_duration, num_columns, num_rows, column_size, table_name)\n", + "description": "In a data logging system, 158 sensors are generating a large amount of data simultaneously. This data needs to be inserted into the database, which has a table with 7 columns and 51 rows of data for each sensor. Each column has a size of 41 characters. Simulate the database exception caused by this process.\n", + "desc": "In a file sharing system, there is a database named 'TeamFileShareDB' that is used for sharing files among teams or organizations. The database stores both the files themselves and the metadata associated with them, such as uploader information, file size, creation and modification dates, version history, access permissions, and download counts. In this scenario, 158 users simultaneously attempt to upload large files to the system, with each file having 7 columns of metadata, each containing information of up to 41 characters. These columns may include file ID, file name, file type, file size, file upload date, uploader information, and file status. Due to the high concurrency in file upload operations, the database can face challenges in handling these numerous concurrent write requests efficiently. This could lead to slower file transfer speeds and potential performance issues, especially if the system's storage and server processing capabilities are not sufficient. 
Additionally, the database might experience contention for system resources, such as CPU or I/O (input/output), further impacting the overall performance and responsiveness of the file sharing system.\n" + }, + "329": { + "start_time": "1697344851", + "end_time": "1697344923", + "start_timestamp": "2023-10-15 12:40:51", + "end_timestamp": "2023-10-15 12:42:03", + "alerts": [], + "labels": [ + "highly concurrent commits or highly concurrent inserts" + ], + "command": "python anomaly_trigger/main.py --anomaly INSERT_LARGE_DATA", + "script": "import psycopg2\nimport sys\nsys.path.append('/root/DB-GPT/')\nimport time\nimport datetime\nimport random\nimport yaml\nfrom multiprocessing.pool import *\n\nclass DBArgs(object):\n\n def __init__(self, dbtype, config, dbname=None):\n self.dbtype = dbtype\n if self.dbtype == 'mysql':\n self.host = config['host']\n self.port = config['port']\n self.user = config['user']\n self.password = config['password']\n self.dbname = dbname if dbname else config['dbname']\n self.driver = 'com.mysql.jdbc.Driver'\n self.jdbc = 'jdbc:mysql://'\n else:\n self.host = config['host']\n self.port = config['port']\n self.user = config['user']\n self.password = config['password']\n self.dbname = dbname if dbname else config['dbname']\n self.driver = 'org.postgresql.Driver'\n self.jdbc = 'jdbc:postgresql://'\n\nclass Database():\n def __init__(self, args, timeout=-1):\n self.args = args\n self.conn = self.resetConn(timeout)\n\n\n # self.schema = self.compute_table_schema()\n\n def resetConn(self, timeout=-1):\n conn = psycopg2.connect(database=self.args.dbname,\n user=self.args.user,\n password=self.args.password,\n host=self.args.host,\n port=self.args.port)\n return conn\n\n def execute_sqls(self,sql):\n self.conn =self.resetConn(timeout=-1)\n cur = self.conn.cursor()\n cur.execute(sql)\n self.conn.commit()\n cur.close()\n self.conn.close()\n\n def execute_sql_duration(self, duration, sql, max_id=0, commit_interval=500):\n self.conn = self.resetConn(timeout=-1)\n cursor = self.conn.cursor()\n start = time.time()\n cnt = 0\n if duration > 0:\n while (time.time() - start) < duration:\n if max_id > 0:\n id = random.randint(1, max_id - 1)\n cursor.execute(sql + str(id) + ';')\n else:\n cursor.execute(sql)\n cnt += 1\n if cnt % commit_interval == 0:\n self.conn.commit()\n else:\n print(\"error, the duration should be larger than 0\")\n self.conn.commit()\n cursor.close()\n self.conn.close()\n return cnt\n\n def concurrent_execute_sql(self, threads, duration, sql, max_id=0, commit_interval=500):\n pool = ThreadPool(threads)\n results = [pool.apply_async(self.execute_sql_duration, (duration, sql, max_id, commit_interval)) for _ in range(threads)]\n pool.close()\n pool.join()\n return results\n \ndef init():\n #add the config\n config_path = \"/root/DB-GPT/config/tool_config.yaml\"\n with open(config_path, 'r') as config_file:\n config = yaml.safe_load(config_file) \n db_args =DBArgs('pgsql', config)\n return db_args\n\n#create a table\ndef create_table(table_name,colsize, ncolumns):\n db=Database(init())\n column_definitions = ', '.join(f'name{i} varchar({colsize})' for i in range(ncolumns))\n creat_sql = f'CREATE TABLE {table_name} (id int, {column_definitions}, time timestamp);'\n db.execute_sqls(creat_sql)\n\n#delete the table\ndef delete_table(table_name):\n db=Database(init())\n delete_sql=f'DROP TABLE if exists {table_name}'\n db.execute_sqls(delete_sql)\n\n#print the current time\ndef print_time():\n current_time = datetime.datetime.now()\n formatted_time = 
current_time.strftime(\"%Y-%m-%d %H:%M:%S\")\n print(formatted_time)\n\n\ndef insert_large_data(threads,duration,ncolumns,nrows,colsize,table_name='table1'):\n\n print_time()\n #Delete undeleted tables\n delete_table(table_name)\n #create a new table\n create_table(table_name,colsize, ncolumns)\n db=Database(init())\n #insert the data\n #insert_definitions = ', '.join(f'repeat(round(random()*999)::text,{(colsize//3)})' for i in range(ncolumns))\n insert_definitions = ', '.join(f'(SELECT substr(md5(random()::text), 1, {colsize}))' for i in range(ncolumns))\n insert_data=f'insert into {table_name} select generate_series(1,{nrows}),{insert_definitions}, now();'\n db.concurrent_execute_sql(threads,duration,insert_data,commit_interval=1)\n\n #delete the table\n delete_table(table_name)\n \n #print the end time\n print_time()\nif __name__ == \"__main__\":\n # Number of threads to use for concurrent inserts\n num_threads = 158\n \n # Duration for which to run the inserts (in seconds)\n insert_duration = None\n \n # Number of columns in the table\n num_columns = 28\n \n # Number of rows to insert\n num_rows = 56\n \n # Size of each column (in characters)\n column_size = 58\n \n # Table name\n table_name = 'table1'\n \n # Call the insert_large_data function\n insert_large_data(num_threads, insert_duration, num_columns, num_rows, column_size, table_name)\n", + "description": "In a data processing system used for scientific research, 158 sensors are simultaneously collecting data and inserting it into a database. Each sensor generates data with 28 attributes, and each attribute has a size of 58 characters. The database table has a total of 56 rows. Simulate the database exception caused by this process.\n", + "desc": "In a file sharing system, suppose there is a database called 'TeamFileShareDB' that is used by teams or organizations to share files. This database not only stores the files themselves but also records metadata about the files such as uploader information, file size, creation date, modification date, version history, access permissions, and download counts. In this scenario, a large number of files need to be uploaded simultaneously. This could be due to a team working on a project where multiple team members are uploading files or due to the need to share a large number of files at once. The database may encounter performance issues when trying to handle the large volume of concurrent upload requests. This could be because the database server's processing power is not sufficient to handle the increased workload, or it could be due to limitations in the storage system, such as insufficient disk space or slow I/O performance. 
As a result, the upload process may be slow or even fail, affecting the efficiency of file sharing and collaboration within the team or organization.\n" + }, + "330": { + "start_time": "1697344983", + "end_time": "1697345043", + "start_timestamp": "2023-10-15 12:43:03", + "end_timestamp": "2023-10-15 12:44:03", + "alerts": [], + "labels": [ + "highly concurrent updates" + ], + "command": "python anomaly_trigger/main.py --anomaly LOCK_CONTENTION", + "script": "import psycopg2\nimport sys\nsys.path.append('/root/DB-GPT/')\nimport time\nimport datetime\nimport random\nimport yaml\nfrom multiprocessing.pool import *\n\n\nclass DBArgs(object):\n\n def __init__(self, dbtype, config, dbname=None):\n self.dbtype = dbtype\n if self.dbtype == 'mysql':\n self.host = config['host']\n self.port = config['port']\n self.user = config['user']\n self.password = config['password']\n self.dbname = dbname if dbname else config['dbname']\n self.driver = 'com.mysql.jdbc.Driver'\n self.jdbc = 'jdbc:mysql://'\n else:\n self.host = config['host']\n self.port = config['port']\n self.user = config['user']\n self.password = config['password']\n self.dbname = dbname if dbname else config['dbname']\n self.driver = 'org.postgresql.Driver'\n self.jdbc = 'jdbc:postgresql://'\n\nclass Database():\n def __init__(self, args, timeout=-1):\n self.args = args\n self.conn = self.resetConn(timeout)\n\n\n # self.schema = self.compute_table_schema()\n\n def resetConn(self, timeout=-1):\n conn = psycopg2.connect(database=self.args.dbname,\n user=self.args.user,\n password=self.args.password,\n host=self.args.host,\n port=self.args.port)\n return conn\n \n def execute_sqls(self,sql):\n self.conn =self.resetConn(timeout=-1)\n cur = self.conn.cursor()\n cur.execute(sql)\n self.conn.commit()\n cur.close()\n self.conn.close()\n\n def execute_sql_duration(self, duration, sql, max_id=0, commit_interval=500):\n self.conn = self.resetConn(timeout=-1)\n cursor = self.conn.cursor()\n start = time.time()\n cnt = 0\n if duration > 0:\n while (time.time() - start) < duration:\n if max_id > 0:\n id = random.randint(1, max_id - 1)\n cursor.execute(sql + str(id) + ';')\n else:\n cursor.execute(sql)\n cnt += 1\n if cnt % commit_interval == 0:\n self.conn.commit()\n else:\n print(\"error, the duration should be larger than 0\")\n self.conn.commit()\n cursor.close()\n self.conn.close()\n return cnt\n\n def concurrent_execute_sql(self, threads, duration, sql, max_id=0, commit_interval=500):\n pool = ThreadPool(threads)\n results = [pool.apply_async(self.execute_sql_duration, (duration, sql, max_id, commit_interval)) for _ in range(threads)]\n pool.close()\n pool.join()\n return results\n\ndef init():\n #add the config\n config_path = \"/root/DB-GPT/config/tool_config.yaml\"\n with open(config_path, 'r') as config_file:\n config = yaml.safe_load(config_file) \n db_args =DBArgs('pgsql', config)\n return db_args\n\n\n#create a table\ndef create_table(table_name,colsize, ncolumns):\n db=Database(init())\n column_definitions = ', '.join(f'name{i} varchar({colsize})' for i in range(ncolumns))\n creat_sql = f'CREATE TABLE {table_name} (id int, {column_definitions}, time timestamp);'\n db.execute_sqls(creat_sql)\n\n#delete the table\ndef delete_table(table_name):\n db=Database(init())\n delete_sql=f'DROP TABLE if exists {table_name}'\n db.execute_sqls(delete_sql)\n\n#print the current time\ndef print_time():\n current_time = datetime.datetime.now()\n formatted_time = current_time.strftime(\"%Y-%m-%d %H:%M:%S\")\n print(formatted_time)\n\n\ndef 
lock_contention(threads,duration,ncolumns,nrows,colsize,table_name='table1'):\n #create a new table\n print_time()\n delete_table(table_name)\n create_table(table_name,colsize, ncolumns)\n db=Database(init())\n # insert some data to be updated \n insert_definitions = ', '.join(f'(SELECT substr(md5(random()::text), 1, {colsize}))' for i in range(ncolumns))\n insert_data=f'insert into {table_name} select generate_series(1,{nrows}),{insert_definitions}, now();' \n db.execute_sqls(insert_data) \n pool = Pool(threads)\n for _ in range(threads):\n pool.apply_async(\n lock, (table_name, ncolumns, colsize, duration, nrows))\n pool.close()\n pool.join()\n #delete the table\n delete_table(table_name)\n print_time()\n\ndef lock(table_name, ncolumns, colsize, duration, nrows):\n args=init()\n start = time.time()\n #lock_contention\n while time.time()-start < duration:\n conn = psycopg2.connect(database=args.dbname, user=args.user, password=args.password,\n host=args.host, port=args.port)\n cur = conn.cursor()\n while time.time()-start < duration:\n col_name = random.randint(0, ncolumns-1)\n row_name = random.randint(1, nrows-1)\n lock_contention = f'update {table_name} set name{col_name}=(SELECT substr(md5(random()::text), 1, {colsize})) where id ={row_name}'\n #db.concurrent_execute_sql(threads,duration,lock_contention,nrows)\n cur.execute(lock_contention)\n conn.commit()\n conn.commit()\n conn.close()\n\nif __name__ == \"__main__\":\n # Number of threads to use for concurrent inserts\n num_threads = 131\n \n # Duration for which to run the inserts (in seconds)\n insert_duration = None\n \n # Number of columns in the table\n num_columns = 99\n \n # Number of rows to insert\n num_rows = 305\n \n # Size of each column (in characters)\n column_size = 72\n \n # Table name\n table_name = 'table1'\n \n # Call the insert_large_data function\n lock_contention(num_threads, insert_duration, num_columns, num_rows, column_size, table_name)\n", + "description": "In a system where multiple users are accessing a database, 131 users are competing to update a table with 99 columns and 305 rows of records, each column having a size of 72 characters. These users are trying to lock the database table in order to perform the update operation simultaneously, which may result in contention and lead to database exceptions.\n", + "desc": "In a banking scenario, there is a database called 'BankingDB' that is responsible for handling customer and transaction data for a bank. This database contains a key table called 'AccountTransactions' which stores detailed information about various banking transactions. It consists of 305 rows of data, each representing a transaction record for an account, with a total of 99 columns, each capable of storing information up to 72 characters.During a typical banking business day, there are multiple users, such as bank staff, automated systems, or customers through an online banking platform, who perform frequent update operations on the 'AccountTransactions' table. These operations could involve updating transaction statuses, modifying transaction amounts, or adding transaction notes.In the given statement, 131 users simultaneously attempt to perform these update operations on the same or adjacent rows in the table. This high concurrency in accessing and modifying the data can result in lock contention. Due to the database's locking mechanism, these concurrent update operations can lead to competition among users for locking the database table. 
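While a script like the one above is updating the same rows of 'table1' from many sessions, the blocked updaters are visible in PostgreSQL's lock bookkeeping. A minimal monitoring sketch, with placeholder connection settings:

import psycopg2

# Placeholder connection; lists sessions waiting on a lock together with the query they run.
conn = psycopg2.connect(database="postgres", user="xxxx", password="xxxx", host="xxxx", port=5432)
cur = conn.cursor()
cur.execute(
    "SELECT a.pid, l.mode, l.relation::regclass, a.query "
    "FROM pg_locks l JOIN pg_stat_activity a ON a.pid = l.pid "
    "WHERE NOT l.granted;"
)
for pid, mode, relation, query in cur.fetchall():
    # Each row is a backend stuck waiting for a lock another session still holds.
    print(pid, mode, relation, query)
cur.close()
conn.close()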
If this contention persists, it can significantly impact the performance of the database, causing delayed or failed transaction requests for other users.Moreover, prolonged locking and contention in the database can also result in increased write latency and affect the overall efficiency of the bank's operations. It may even lead to temporary interruptions in database services and rapid growth in the database transaction log, consuming excessive storage space.\n" + }, + "331": { + "start_time": "1697345103", + "end_time": "1697345209", + "start_timestamp": "2023-10-15 12:45:03", + "end_timestamp": "2023-10-15 12:46:49", + "alerts": [], + "labels": [ + "highly deletes" + ], + "command": "python anomaly_trigger/main.py --anomaly VACUUM", + "script": "import psycopg2\nimport sys\nsys.path.append('/root/DB-GPT/')\nimport time\nimport datetime\nimport random\nimport yaml\nfrom multiprocessing.pool import *\n\n\nclass DBArgs(object):\n\n def __init__(self, dbtype, config, dbname=None):\n self.dbtype = dbtype\n if self.dbtype == 'mysql':\n self.host = config['host']\n self.port = config['port']\n self.user = config['user']\n self.password = config['password']\n self.dbname = dbname if dbname else config['dbname']\n self.driver = 'com.mysql.jdbc.Driver'\n self.jdbc = 'jdbc:mysql://'\n else:\n self.host = config['host']\n self.port = config['port']\n self.user = config['user']\n self.password = config['password']\n self.dbname = dbname if dbname else config['dbname']\n self.driver = 'org.postgresql.Driver'\n self.jdbc = 'jdbc:postgresql://'\n\nclass Database():\n def __init__(self, args, timeout=-1):\n self.args = args\n self.conn = self.resetConn(timeout)\n\n\n # self.schema = self.compute_table_schema()\n\n def resetConn(self, timeout=-1):\n conn = psycopg2.connect(database=self.args.dbname,\n user=self.args.user,\n password=self.args.password,\n host=self.args.host,\n port=self.args.port)\n return conn\n \n def execute_sqls(self,sql):\n self.conn =self.resetConn(timeout=-1)\n cur = self.conn.cursor()\n cur.execute(sql)\n self.conn.commit()\n cur.close()\n self.conn.close()\n\n def execute_sql_duration(self, duration, sql, max_id=0, commit_interval=500):\n self.conn = self.resetConn(timeout=-1)\n cursor = self.conn.cursor()\n start = time.time()\n cnt = 0\n if duration > 0:\n while (time.time() - start) < duration:\n if max_id > 0:\n id = random.randint(1, max_id - 1)\n cursor.execute(sql + str(id) + ';')\n else:\n cursor.execute(sql)\n cnt += 1\n if cnt % commit_interval == 0:\n self.conn.commit()\n else:\n print(\"error, the duration should be larger than 0\")\n self.conn.commit()\n cursor.close()\n self.conn.close()\n return cnt\n\n def concurrent_execute_sql(self, threads, duration, sql, max_id=0, commit_interval=500):\n pool = ThreadPool(threads)\n results = [pool.apply_async(self.execute_sql_duration, (duration, sql, max_id, commit_interval)) for _ in range(threads)]\n pool.close()\n pool.join()\n return results\n \ndef init():\n #add the config\n config_path = \"/root/DB-GPT/config/tool_config.yaml\"\n with open(config_path, 'r') as config_file:\n config = yaml.safe_load(config_file) \n db_args =DBArgs('pgsql', config)\n return db_args\n\n\n#create a table\ndef create_table(table_name,colsize, ncolumns):\n db=Database(init())\n column_definitions = ', '.join(f'name{i} varchar({colsize})' for i in range(ncolumns))\n creat_sql = f'CREATE TABLE {table_name} (id int, {column_definitions}, time timestamp);'\n db.execute_sqls(creat_sql)\n\n#delete the table\ndef delete_table(table_name):\n 
db=Database(init())\n delete_sql=f'DROP TABLE if exists {table_name}'\n db.execute_sqls(delete_sql)\n\n#print the current time\ndef print_time():\n current_time = datetime.datetime.now()\n formatted_time = current_time.strftime(\"%Y-%m-%d %H:%M:%S\")\n print(formatted_time)\n\n\ndef vacuum(threads,duration,ncolumns,nrows,colsize,table_name='table1'):\n db=Database(init())\n #create a new table\n print_time()\n delete_table(table_name)\n create_table(table_name,colsize, ncolumns)\n\n # insert some data to be deleted\n insert_definitions = ', '.join(f'(SELECT substr(md5(random()::text), 1, {colsize}))' for i in range(ncolumns))\n insert_data=f'insert into {table_name} select generate_series(1,{nrows}),{insert_definitions}, now();' \n db.execute_sqls(insert_data) \n\n # delete 80% of the rows\n delete_nrows=int(nrows*0.8)\n vacuum=f'delete from {table_name} where id < {delete_nrows};'\n db.execute_sqls(vacuum)\n\n # do the select , then the vacuum occurs\n select='select * from '+table_name+' where id='\n db.concurrent_execute_sql(threads,duration,select,nrows)\n\n #delete the table\n delete_table(table_name)\n print_time()\n\n\nif __name__ == \"__main__\":\n # Number of threads to use for concurrent inserts\n num_threads = 136\n \n # Duration for which to run the inserts (in seconds)\n insert_duration = None\n \n # Number of columns in the table\n num_columns = 7\n \n # Number of rows to insert\n num_rows = 3133084\n \n # Size of each column (in characters)\n column_size = 58\n \n # Table name\n table_name = 'table1'\n \n # Call the insert_large_data function\n vacuum(num_threads, insert_duration, num_columns, num_rows, column_size, table_name)\n", + "description": "In the database of an online store, a large-scale data cleaning operation is performed on a database table containing 7 columns, 3,133,084 rows, each column size of 58 characters of commodity records. 136 users simultaneously perform a search operation after the data cleaning process, causing an exception in the database.\n", + "desc": "In a file sharing system, let's imagine a database called 'FileShareDB' where users can upload and download files. This database stores metadata about the files, such as file name, size, upload date, download count, and user information. On a typical day, there are many users simultaneously uploading and downloading files. However, over time, outdated and unused files accumulate in the database, occupying unnecessary storage space and possibly impacting the performance of the system. To address this issue, a vacuuming operation needs to be performed. This operation involves deleting and reclaiming storage space for files that are no longer needed or have been marked for deletion. In this particular case, the system administrator runs a script that triggers the vacuuming operation. The script specifies that the vacuuming process should utilize 136 threads for parallel processing. The 'FileShareDB' database contains a table called 'FileMetadata', which contains 3,133,084 rows of data, each representing a file record, with 7 columns, each having a maximum size of 58 characters. 
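The deletions in this script leave dead tuples behind in 'table1' until vacuum reclaims them, which is what the VACUUM label refers to. A small sketch for checking the dead-tuple count and triggering a manual vacuum; the connection settings are placeholders.

import psycopg2

# Placeholder connection. After ~80% of the rows are deleted, n_dead_tup for 'table1'
# grows until autovacuum (or a manual VACUUM) reclaims the space.
conn = psycopg2.connect(database="postgres", user="xxxx", password="xxxx", host="xxxx", port=5432)
conn.autocommit = True          # VACUUM cannot run inside a transaction block
cur = conn.cursor()
cur.execute("SELECT n_live_tup, n_dead_tup, last_autovacuum "
            "FROM pg_stat_user_tables WHERE relname = 'table1';")
print(cur.fetchone())
cur.execute("VACUUM (VERBOSE, ANALYZE) table1;")   # manual cleanup, for illustration
cur.close()
conn.close()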
The vacuuming operation helps optimize database performance by freeing up storage space and improving the efficiency of queries and file operations in the file sharing system.\n" + }, + "332": { + "start_time": "1697345270", + "end_time": "1697345383", + "start_timestamp": "2023-10-15 12:47:50", + "end_timestamp": "2023-10-15 12:49:43", + "alerts": [], + "labels": [ + "too many indexes" + ], + "command": "python anomaly_trigger/main.py --anomaly REDUNDANT_INDEX", + "script": "import psycopg2\nimport sys\nsys.path.append('/root/DB-GPT/')\nimport time\nimport datetime\nimport random\nimport yaml\nfrom multiprocessing.pool import *\n\n\nclass DBArgs(object):\n\n def __init__(self, dbtype, config, dbname=None):\n self.dbtype = dbtype\n if self.dbtype == 'mysql':\n self.host = config['host']\n self.port = config['port']\n self.user = config['user']\n self.password = config['password']\n self.dbname = dbname if dbname else config['dbname']\n self.driver = 'com.mysql.jdbc.Driver'\n self.jdbc = 'jdbc:mysql://'\n else:\n self.host = config['host']\n self.port = config['port']\n self.user = config['user']\n self.password = config['password']\n self.dbname = dbname if dbname else config['dbname']\n self.driver = 'org.postgresql.Driver'\n self.jdbc = 'jdbc:postgresql://'\n\nclass Database():\n def __init__(self, args, timeout=-1):\n self.args = args\n self.conn = self.resetConn(timeout)\n\n\n # self.schema = self.compute_table_schema()\n\n def resetConn(self, timeout=-1):\n conn = psycopg2.connect(database=self.args.dbname,\n user=self.args.user,\n password=self.args.password,\n host=self.args.host,\n port=self.args.port)\n return conn\n \n def execute_sqls(self,sql):\n self.conn =self.resetConn(timeout=-1)\n cur = self.conn.cursor()\n cur.execute(sql)\n self.conn.commit()\n cur.close()\n self.conn.close()\n\n def execute_sql_duration(self, duration, sql, max_id=0, commit_interval=500):\n self.conn = self.resetConn(timeout=-1)\n cursor = self.conn.cursor()\n start = time.time()\n cnt = 0\n if duration > 0:\n while (time.time() - start) < duration:\n if max_id > 0:\n id = random.randint(1, max_id - 1)\n cursor.execute(sql + str(id) + ';')\n else:\n cursor.execute(sql)\n cnt += 1\n if cnt % commit_interval == 0:\n self.conn.commit()\n else:\n print(\"error, the duration should be larger than 0\")\n self.conn.commit()\n cursor.close()\n self.conn.close()\n return cnt\n\n def concurrent_execute_sql(self, threads, duration, sql, max_id=0, commit_interval=500):\n pool = ThreadPool(threads)\n results = [pool.apply_async(self.execute_sql_duration, (duration, sql, max_id, commit_interval)) for _ in range(threads)]\n pool.close()\n pool.join()\n return results\n \n def build_index(self, table_name, idx_num):\n self.conn = self.resetConn(timeout=-1)\n cursor = self.conn.cursor()\n \n for i in range(0, idx_num):\n the_sql = 'CREATE INDEX index_' + table_name + '_' + str(i) + ' ON ' + table_name + '(name' + str(i) + ');'\n print(the_sql)\n cursor.execute(the_sql)\n\n \n self.conn.commit()\n self.conn.close()\n return\n\n\n \n def drop_index(self,table_name):\n self.conn = self.resetConn(timeout=-1)\n cursor = self.conn.cursor()\n cursor.execute(\"select indexname from pg_indexes where tablename='\"+table_name+\"';\")\n idxs = cursor.fetchall()\n for idx in idxs:\n the_sql = 'DROP INDEX ' + idx[0] + ';'\n cursor.execute(the_sql)\n print(the_sql)\n self.conn.commit()\n self.conn.close()\n return\n\n\ndef init():\n #add the config\n config_path = \"/root/DB-GPT/config/tool_config.yaml\"\n with open(config_path, 'r') 
as config_file:\n config = yaml.safe_load(config_file) \n db_args =DBArgs('pgsql', config)\n return db_args\n\n\n#create a table\ndef create_table(table_name,colsize, ncolumns):\n db=Database(init())\n column_definitions = ', '.join(f'name{i} varchar({colsize})' for i in range(ncolumns))\n creat_sql = f'CREATE TABLE {table_name} (id int, {column_definitions}, time timestamp);'\n db.execute_sqls(creat_sql)\n\n#delete the table\ndef delete_table(table_name):\n db=Database(init())\n delete_sql=f'DROP TABLE if exists {table_name}'\n db.execute_sqls(delete_sql)\n\n#print the current time\ndef print_time():\n current_time = datetime.datetime.now()\n formatted_time = current_time.strftime(\"%Y-%m-%d %H:%M:%S\")\n print(formatted_time)\n\ndef redundent_index(threads,duration,ncolumns,nrows,colsize,nindex,table_name='table1'):\n #create a new table\n print_time()\n delete_table(table_name)\n create_table(table_name,colsize, ncolumns)\n db=Database(init())\n # insert some data to be updated \n insert_definitions = ', '.join(f'(SELECT substr(md5(random()::text), 1, {colsize}))' for i in range(ncolumns))\n insert_data=f'insert into {table_name} select generate_series(1,{nrows}),{insert_definitions}, now();' \n db.execute_sqls(insert_data) \n\n #initialization of the indexes\n nindex=int((nindex*ncolumns)/10)\n db.build_index(table_name,nindex)\n id_index='CREATE INDEX index_'+table_name+'_id ON '+table_name+'(id);'\n db.execute_sqls(id_index)\n\n #lock_contention\n pool = Pool(threads)\n for _ in range(threads):\n pool.apply_async(\n lock, (table_name, ncolumns, colsize, duration, nrows))\n pool.close()\n pool.join()\n\n #drop the index\n db.drop_index(table_name)\n\n #delete the table\n delete_table(table_name)\n print_time()\n\n\ndef lock(table_name, ncolumns, colsize, duration, nrows):\n args=init()\n start = time.time()\n #lock_contention\n while time.time()-start < duration:\n conn = psycopg2.connect(database=args.dbname, user=args.user, password=args.password,\n host=args.host, port=args.port)\n cur = conn.cursor()\n while time.time()-start < duration:\n col_name = random.randint(0, ncolumns-1)\n row_name = random.randint(1, nrows-1)\n lock_contention = f'update {table_name} set name{col_name}=(SELECT substr(md5(random()::text), 1, {colsize})) where id ={row_name}'\n #db.concurrent_execute_sql(threads,duration,lock_contention,nrows)\n cur.execute(lock_contention)\n conn.commit()\n conn.commit()\n conn.close()\n\nif __name__ == \"__main__\":\n # Number of threads to use for concurrent inserts\n num_threads = 10\n \n # Duration for which to run the inserts (in seconds)\n insert_duration = None\n \n # Number of columns in the table\n num_columns = 89\n \n # Number of rows to insert\n num_rows = 650796\n \n # Size of each column (in characters)\n column_size = 77\n \n # Table name\n table_name = 'table1'\n \n nindex=6\n \n # Call the insert_large_data function\n redundent_index(num_threads, insert_duration, num_columns, num_rows, column_size, nindex,table_name)\n", + "description": "In a database system of an online marketplace, there is a situation where redundant indexes are created for certain item attributes such as product name, category, and price range. This leads to additional storage usage and performance overhead. To simulate this scenario, 10 users are querying the database table, which contains 89 columns and 650,796 rows of product records. 
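Redundant indexes of the kind this script builds on 'table1' can be listed, together with how often each one has actually been used, from the standard catalog and statistics views. A minimal sketch with placeholder connection settings:

import psycopg2

# Placeholder connection; shows every index on 'table1' and its scan count.
conn = psycopg2.connect(database="postgres", user="xxxx", password="xxxx", host="xxxx", port=5432)
cur = conn.cursor()
cur.execute("SELECT indexname, indexdef FROM pg_indexes WHERE tablename = 'table1';")
for name, definition in cur.fetchall():
    print(name, definition)
cur.execute("SELECT indexrelname, idx_scan FROM pg_stat_user_indexes "
            "WHERE relname = 'table1' ORDER BY idx_scan;")
for name, scans in cur.fetchall():
    print(name, "scans:", scans)   # idx_scan = 0 marks indexes that only add write overhead
cur.close()
conn.close()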
Each column has a size of 77 characters.\n", + "desc": "In a business intelligence scenario, specifically in the financial sector, there is a database called 'FinancialRecordsDB' that is used to store and analyze financial data for various organizations. This database contains a key table named 'FinancialRecords' that records detailed information about financial transactions and statements. The table has a total of 650,796 rows of data, with each row representing a financial record. The table consists of 89 columns, each with a maximum size of 77 characters. These columns include transaction ID, transaction type, amount, date, department, project code, budget code, financial year, audit status, and more.To optimize the performance of financial analysis queries, the database administrator decides to create redundant indexes on various fields, such as transaction type, date range, department, or project code. These indexes are created to accelerate complex queries, such as budget analysis, income reports, or audits. However, the frequent creation of indexes can lead to additional storage usage and performance overhead in the database. Moreover, this can also result in database fragmentation, which further impacts performance. During peak periods, the generation of financial reports might be delayed, affecting the efficiency of the decision-making process.\n" + }, + "333": { + "start_time": "1697345444", + "end_time": "1697345534", + "start_timestamp": "2023-10-15 12:50:44", + "end_timestamp": "2023-10-15 12:52:14", + "alerts": [], + "labels": [ + "INSERT_LARGE_DATA", + "IO_CONTENTION" + ], + "command": "python anomaly_trigger/main.py --anomaly INSERT_LARGE_DATA,IO_CONTENTION", + "script": "import os\nimport datetime\n\n#print the current time\ndef print_time():\n current_time = datetime.datetime.now()\n formatted_time = current_time.strftime(\"%Y-%m-%d %H:%M:%S\")\n print(formatted_time)\n\nif __name__ == \"__main__\":\n print_time()\n command = (\n \"su - root -c 'cd /sysbench-tpcc-master; \"\n \"./tpcc.lua --db-driver=pgsql --tables=2 --scale=3 --threads=50 --events=0 \"\n \"--pgsql-host=xxxx --pgsql-user=xxxx --pgsql-password=xxxx \"\n \"--pgsql-port=5432 --pgsql-db=tpcc --time=90 --rand-type=uniform --report-interval=10 run'\"\n )\n\n os.system(command)\n print_time()\n", + "description": "In a collaborative document editing platform, when multiple users try to upload, download or edit files simultaneously, the file system encounters a high competition for input/output resources. As a result, file transfer becomes slower and less efficient.\n", + "desc": "In a file sharing system scenario, there is a database called 'FileShareDB' that is used by teams or organizations to share files. This database not only stores the files themselves, but also records metadata about the files, such as uploader information, file size, creation and modification dates, version history, access permissions, and download counts. During a typical workday, multiple users may be simultaneously uploading, downloading, or editing files. For example, a project team may be collaborating on an important report, with team members frequently uploading the latest versions of files while others download them for viewing or editing. Additionally, the system may be used to store and share large files, such as presentations, video conference recordings, or design drawings. However, due to the high concurrency in file operations, the 'FileShareDB' faces challenges of input/output (I/O) contention. 
When multiple users simultaneously upload or download large files, the system's storage and network bandwidth can become strained. This I/O contention can result in slower file transfer speeds, especially in situations of limited bandwidth or insufficient server processing capabilities. Moreover, frequent write operations in the database, such as file uploads and metadata updates, can impact database performance. During peak periods, the database may encounter locking and transaction management issues, further slowing down file processing and metadata recording.\n" + }, + "334": { + "start_time": "1697345594", + "end_time": "1697345655", + "start_timestamp": "2023-10-15 12:53:14", + "end_timestamp": "2023-10-15 12:54:15", + "alerts": [], + "labels": [ + "POOR JOIN PERFORMANCE", + "CPU CONTENTION" + ], + "command": "python anomaly_trigger/main.py --anomaly POOR_JOIN_PERFORMANCE,CPU_CONTENTION", + "script": "import os\nimport re\nimport time\n\nimport psycopg2\n\n\nREPEATCOUNT = 1\nTIMELOGPATH = str(int(time.time())) + \"_job_trigger_time_log.txt\"\nTIMELOG = open(TIMELOGPATH, 'w+')\n\n\nclass Database():\n\n def __init__(self):\n self.conn = None\n self.conn = psycopg2.connect(database='imdbload',\n user='xxxx',\n password='xxxx',\n host='xxxx',\n port=5432)\n\n def execute_sql(self, sql):\n fail = 1\n cur = self.conn.cursor()\n i = 0\n cnt = 3\n while fail == 1 and i < cnt:\n try:\n fail = 0\n cur.execute(sql)\n except BaseException as error:\n fail = 1\n print(error)\n res = []\n if fail == 0:\n res = cur.fetchall()\n i = i + 1\n if fail == 1:\n # print(\"SQL Execution Fatal!!\", sql)\n return 0, ''\n elif fail == 0:\n return 1, res\n\n\ndef all_sql_files():\n res_path = \"{}/join-order-benchmark-master/\".format(\n os.path.dirname(os.path.abspath(__file__)))\n # all_file_list = list(filter(file_filter, os.listdir(res_path)))\n # all_file_list = sorted(all_file_list, key=custom_sort)\n all_file_list = [\n '1a.sql', '1b.sql', '1c.sql', '1d.sql',\n '2a.sql', '2b.sql', '2c.sql', '2d.sql',\n '3a.sql', '3b.sql', '3c.sql',\n '4a.sql', '4b.sql', '4c.sql',\n '5a.sql', '5b.sql', '5c.sql',\n '6a.sql', '6b.sql', '6c.sql', '6d.sql', '6e.sql', '6f.sql',\n '7a.sql', '7b.sql', '7c.sql',\n '8a.sql', '8b.sql', '8c.sql', '8d.sql',\n '9a.sql', '9b.sql', '9c.sql', '9d.sql',\n '10a.sql', '10b.sql', '10c.sql',\n '11a.sql', '11b.sql', '11c.sql', '11d.sql',\n '12a.sql', '12b.sql', '12c.sql',\n '13a.sql', '13b.sql', '13c.sql', '13d.sql',\n '14a.sql', '14b.sql', '14c.sql',\n '15a.sql', '15b.sql', '15c.sql', '15d.sql',\n '16a.sql', '16b.sql', '16c.sql', '16d.sql',\n '17a.sql', '17b.sql', '17c.sql', '17d.sql', '17e.sql', '17f.sql',\n '18a.sql', '18b.sql', '18c.sql',\n '19a.sql', '19b.sql', '19c.sql', '19d.sql',\n '20a.sql', '20b.sql', '20c.sql',\n '21a.sql', '21b.sql', '21c.sql',\n '22a.sql', '22b.sql', '22c.sql', '22d.sql',\n '23a.sql', '23b.sql', '23c.sql',\n '24a.sql', '24b.sql',\n '25a.sql', '25b.sql', '25c.sql',\n '26a.sql', '26b.sql', '26c.sql',\n '27a.sql', '27b.sql', '27c.sql',\n '28a.sql', '28b.sql', '28c.sql',\n '29a.sql', '29b.sql', '29c.sql',\n '30a.sql', '30b.sql', '30c.sql',\n '31a.sql', '31b.sql', '31c.sql',\n '32a.sql', '32b.sql',\n '33a.sql', '33b.sql', '33c.sql']\n\n print(all_file_list)\n files_list = []\n for file in all_file_list:\n files_list.append(res_path + file)\n return files_list\n\n\ndef custom_sort(item):\n # \u63d0\u53d6\u6570\u5b57\u548c\u5b57\u6bcd\u90e8\u5206\n match = re.match(r'(\\d+)(\\D+)', item)\n # 
\u5c06\u6570\u5b57\u90e8\u5206\u8f6c\u6362\u4e3a\u6574\u6570\u4ee5\u8fdb\u884c\u6bd4\u8f83\n num_part = int(match.group(1))\n # \u8fd4\u56de\u5143\u7ec4\u4ee5\u6309\u6570\u5b57\u548c\u5b57\u6bcd\u6392\u5e8f\n return (num_part, match.group(2))\n\n\ndef file_filter(f):\n if f[-4:] == '.sql' and 'schema' not in f and 'fkindexes' not in f:\n return True\n else:\n return False\n\n\ndef get_sql_from_file(file_name):\n file = open(file_name)\n lines = file.readlines().copy()\n sql = ''\n for line in lines:\n sql += line\n sql = sql.replace('\n', ' ').replace(' ', ' ').replace(' ', ' ')\n file.close()\n return sql\n\n\ndef test_hint_from_file(sql_file):\n db = Database()\n sql = get_sql_from_file(sql_file)\n success, result_cont = db.execute_sql(sql)\n print(success, result_cont)\n\n\ndef test_all():\n sql_files = all_sql_files()\n\n for sql_file in list(sql_files)[:-10]:\n if sql_file:\n test_hint_from_file(sql_file)\n\n\ndef test_one():\n res_path = \"{}/join-order-benchmark-master/\".format(\n os.path.dirname(os.path.abspath(__file__)))\n test_hint_from_file(res_path + '1a.sql')\n\n\nif __name__ == '__main__':\n for i in range(0, REPEATCOUNT):\n TIMELOG.write(str(int(time.time()))+\";\")\n test_all()\n TIMELOG.write(str(int(time.time()))+\"\n\")\n TIMELOG.flush()\n\n TIMELOG.close()\n\n)\n", + "description": "In a finance management system, multiple users are running a query that involves joining large tables together. However, the query is not optimized and the system is experiencing contention issues with CPU resources, leading to poor performance.\n", + "desc": "In a Business Intelligence scenario, let's consider a database called 'CorporateAnalyticsDB', which is used for storing and analyzing various business data for a large corporation. This database contains multiple complex tables such as 'SalesData', 'CustomerProfiles', 'ProductCatalog', and 'MarketTrends', each filled with a large number of rows and columns, recording detailed business information.In this scenario, the company's analysts frequently need to perform complex join queries across multiple tables to generate comprehensive business reports. For example, they might need to join the 'SalesData' table (containing sales records) with the 'CustomerProfiles' table (containing customer information) to analyze the purchasing behaviors of different customer groups. At the same time, they might also need to link these data with 'ProductCatalog' (the product catalog) and 'MarketTrends' (market trends) to gain deeper market insights.Given the large size of these tables and the involvement of multi-table joins, the performance of these queries can become very slow. If the join keys in these tables are not effectively indexed, or if the queries are not properly optimized, then executing these join operations can consume a significant amount of time and resources. During peak periods, when multiple complex join queries are executed simultaneously, this can lead to competition for CPU resources, further reducing query efficiency. 
The CPU contention might occur due to too many compute-intensive queries running on the database server, or the server's CPU resources being insufficient to handle these queries.\n" + }, + "335": { + "start_time": "1697345715", + "end_time": "1697345855", + "start_timestamp": "2023-10-15 12:55:15", + "end_timestamp": "2023-10-15 12:57:35", + "alerts": [], + "labels": [ + "FETCH_LARGE_DATA", + "CORRELATED SUBQUERY" + ], + "command": "python anomaly_trigger/main.py --anomaly FETCH_LARGE_DATA,CORRELATED_SUBQUERY", + "script": "import os\nimport re\nimport time\n\nimport psycopg2\n\n\nREPEATCOUNT = 1\nTIMELOGPATH = str(int(time.time())) + \"_tpch_trigger_time_log.txt\"\nTIMELOG = open(TIMELOGPATH, 'w+')\n\n\nclass Database():\n\n def __init__(self):\n self.conn = None\n self.conn = psycopg2.connect(database='tpch',\n user='xxxx',\n password='xxxx',\n host='xxxx',\n port=5432)\n\n def execute_sql(self, sql):\n fail = 1\n cur = self.conn.cursor()\n i = 0\n cnt = 3\n while fail == 1 and i < cnt:\n try:\n fail = 0\n cur.execute(sql)\n except BaseException as error:\n fail = 1\n print(error)\n res = []\n if fail == 0:\n res = cur.fetchall()\n i = i + 1\n if fail == 1:\n # print(\"SQL Execution Fatal!!\", sql)\n return 0, ''\n elif fail == 0:\n return 1, res\n\n\ndef all_sql_files():\n res_path = \"{}/tpch-queries/\".format(\n os.path.dirname(os.path.abspath(__file__)))\n # all_file_list = list(filter(file_filter, os.listdir(res_path)))\n # all_file_list = sorted(all_file_list, key=custom_sort)\n all_file_list = [\n '4.explain.sql']\n\n print(all_file_list)\n files_list = []\n for file in all_file_list:\n files_list.append(res_path + file)\n return files_list\n\n\ndef custom_sort(item):\n # \u63d0\u53d6\u6570\u5b57\u548c\u5b57\u6bcd\u90e8\u5206\n match = re.match(r'(\\d+)(\\D+)', item)\n # \u5c06\u6570\u5b57\u90e8\u5206\u8f6c\u6362\u4e3a\u6574\u6570\u4ee5\u8fdb\u884c\u6bd4\u8f83\n num_part = int(match.group(1))\n # \u8fd4\u56de\u5143\u7ec4\u4ee5\u6309\u6570\u5b57\u548c\u5b57\u6bcd\u6392\u5e8f\n return (num_part, match.group(2))\n\n\ndef file_filter(f):\n if f[-4:] == '.sql' and 'schema' not in f and 'fkindexes' not in f:\n return True\n else:\n return False\n\n\ndef get_sql_from_file(file_name):\n file = open(file_name)\n lines = file.readlines().copy()\n sql = ''\n for line in lines:\n sql += line\n sql = sql.replace('\n', ' ').replace(' ', ' ').replace(' ', ' ')\n file.close()\n return sql\n\n\ndef test_hint_from_file(sql_file):\n db = Database()\n sql = get_sql_from_file(sql_file)\n success, result_cont = db.execute_sql(sql)\n print(success, result_cont)\n\n\ndef test_all():\n sql_files = all_sql_files()\n\n for sql_file in list(sql_files):\n if sql_file:\n test_hint_from_file(sql_file)\n\n\ndef test_one():\n res_path = \"{}/tpch-queries/\".format(\n os.path.dirname(os.path.abspath(__file__)))\n test_hint_from_file(res_path + '1.explain.sql')\n\n\nif __name__ == '__main__':\n for i in range(0, REPEATCOUNT):\n TIMELOG.write(str(int(time.time()))+\";\")\n test_all()\n TIMELOG.write(str(int(time.time()))+\"\n\")\n TIMELOG.flush()\n\n TIMELOG.close()\n", + "description": "In a database of an e-commerce platform, fetching a large amount of data involves executing related subqueries to determine the inventory levels for each product. 
If these subqueries are not optimized, querying the inventory can become slow and inefficient.\n", + "desc": "In the database of an e-commerce platform, suppose there is a database named 'ECommerceDB', which includes a crucial table named 'ProductInventory' for recording the inventory information of various products. This table might contain inventory data for tens of thousands or even hundreds of thousands of products. The inventory information for each product includes the product ID, current stock level, last inventory update time, supplier ID, warehouse location, and other details. In this database, querying the inventory level of each product may require performing related subqueries. For example, a common query might be to determine the total current inventory of all products within a specific category. This type of query might first involve selecting all products of a particular category from the 'ProductDetails' table, then conducting subqueries on the 'ProductInventory' table to obtain the inventory data for these products. When the number of products is very large, the performance of these related subqueries can become inefficient. For instance, if a category includes thousands of products, executing individual subqueries for each product to obtain inventory information would be time-consuming. In such cases, due to the need to retrieve inventory information for a large number of products, the database might need to read a significant amount of data from the disk, which could lead to I/O bottlenecks.\n" + }, + "336": { + "start_time": "1697345915", + "end_time": "1697345987", + "start_timestamp": "2023-10-15 12:58:35", + "end_timestamp": "2023-10-15 12:59:47", + "alerts": [], + "labels": [ + "highly concurrent commits or highly concurrent inserts" + ], + "command": "python anomaly_trigger/main.py --anomaly INSERT_LARGE_DATA", + "script": "import psycopg2\nimport sys\nsys.path.append('/root/DB-GPT/')\nimport time\nimport datetime\nimport random\nimport yaml\nfrom multiprocessing.pool import *\n\nclass DBArgs(object):\n\n def __init__(self, dbtype, config, dbname=None):\n self.dbtype = dbtype\n if self.dbtype == 'mysql':\n self.host = config['host']\n self.port = config['port']\n self.user = config['user']\n self.password = config['password']\n self.dbname = dbname if dbname else config['dbname']\n self.driver = 'com.mysql.jdbc.Driver'\n self.jdbc = 'jdbc:mysql://'\n else:\n self.host = config['host']\n self.port = config['port']\n self.user = config['user']\n self.password = config['password']\n self.dbname = dbname if dbname else config['dbname']\n self.driver = 'org.postgresql.Driver'\n self.jdbc = 'jdbc:postgresql://'\n\nclass Database():\n def __init__(self, args, timeout=-1):\n self.args = args\n self.conn = self.resetConn(timeout)\n\n\n # self.schema = self.compute_table_schema()\n\n def resetConn(self, timeout=-1):\n conn = psycopg2.connect(database=self.args.dbname,\n user=self.args.user,\n password=self.args.password,\n host=self.args.host,\n port=self.args.port)\n return conn\n\n def execute_sqls(self,sql):\n self.conn =self.resetConn(timeout=-1)\n cur = self.conn.cursor()\n cur.execute(sql)\n self.conn.commit()\n cur.close()\n self.conn.close()\n\n def execute_sql_duration(self, duration, sql, max_id=0, commit_interval=500):\n self.conn = self.resetConn(timeout=-1)\n cursor = self.conn.cursor()\n start = time.time()\n cnt = 0\n if duration > 0:\n while (time.time() - start) < duration:\n if max_id > 0:\n id = random.randint(1, max_id - 1)\n cursor.execute(sql + str(id) 
+ ';')\n else:\n cursor.execute(sql)\n cnt += 1\n if cnt % commit_interval == 0:\n self.conn.commit()\n else:\n print(\"error, the duration should be larger than 0\")\n self.conn.commit()\n cursor.close()\n self.conn.close()\n return cnt\n\n def concurrent_execute_sql(self, threads, duration, sql, max_id=0, commit_interval=500):\n pool = ThreadPool(threads)\n results = [pool.apply_async(self.execute_sql_duration, (duration, sql, max_id, commit_interval)) for _ in range(threads)]\n pool.close()\n pool.join()\n return results\n \ndef init():\n #add the config\n config_path = \"/root/DB-GPT/config/tool_config.yaml\"\n with open(config_path, 'r') as config_file:\n config = yaml.safe_load(config_file) \n db_args =DBArgs('pgsql', config)\n return db_args\n\n#create a table\ndef create_table(table_name,colsize, ncolumns):\n db=Database(init())\n column_definitions = ', '.join(f'name{i} varchar({colsize})' for i in range(ncolumns))\n creat_sql = f'CREATE TABLE {table_name} (id int, {column_definitions}, time timestamp);'\n db.execute_sqls(creat_sql)\n\n#delete the table\ndef delete_table(table_name):\n db=Database(init())\n delete_sql=f'DROP TABLE if exists {table_name}'\n db.execute_sqls(delete_sql)\n\n#print the current time\ndef print_time():\n current_time = datetime.datetime.now()\n formatted_time = current_time.strftime(\"%Y-%m-%d %H:%M:%S\")\n print(formatted_time)\n\n\ndef insert_large_data(threads,duration,ncolumns,nrows,colsize,table_name='table1'):\n\n print_time()\n #Delete undeleted tables\n delete_table(table_name)\n #create a new table\n create_table(table_name,colsize, ncolumns)\n db=Database(init())\n #insert the data\n #insert_definitions = ', '.join(f'repeat(round(random()*999)::text,{(colsize//3)})' for i in range(ncolumns))\n insert_definitions = ', '.join(f'(SELECT substr(md5(random()::text), 1, {colsize}))' for i in range(ncolumns))\n insert_data=f'insert into {table_name} select generate_series(1,{nrows}),{insert_definitions}, now();'\n db.concurrent_execute_sql(threads,duration,insert_data,commit_interval=1)\n\n #delete the table\n delete_table(table_name)\n \n #print the end time\n print_time()\nif __name__ == \"__main__\":\n # Number of threads to use for concurrent inserts\n num_threads = 97\n \n # Duration for which to run the inserts (in seconds)\n insert_duration = None\n \n # Number of columns in the table\n num_columns = 6\n \n # Number of rows to insert\n num_rows = 67\n \n # Size of each column (in characters)\n column_size = 56\n \n # Table name\n table_name = 'table1'\n \n # Call the insert_large_data function\n insert_large_data(num_threads, insert_duration, num_columns, num_rows, column_size, table_name)\n", + "description": "In an IoT application, 97 sensors are generating a large amount of data, which needs to be inserted into a database. Each row in the database table has 6 columns, with each column having a size of 56 characters. There are 67 rows in total. This simulation aims to trigger a database exception caused by the simultaneous insertion of the large data set.\n", + "desc": "In the file sharing system scenario, there is a database named 'FileShareDB' that is used for sharing files among users. This database stores both the files themselves and the metadata associated with each file, such as the uploader information, file size, creation date, and access permissions. Multiple users can simultaneously upload, download, or edit files in this system. 
For example, a team could be working on a collaborative project, where team members are frequently uploading new versions of files, while others download them for viewing or editing. Additionally, large files, such as presentations or videos, can also be stored and shared in this system. However, when multiple users simultaneously upload or download large files, the system's storage and network bandwidth might become overwhelmed. This can result in slower file transfer speeds, especially if there is limited bandwidth or insufficient server processing capacity. Furthermore, frequent write operations in the database, such as file uploads and metadata updates, can impact the overall performance of the system. During peak usage periods, the database might experience locking and transaction management issues, further slowing down file processing and metadata recording.\n" + }, + "337": { + "start_time": "1697346047", + "end_time": "1697346118", + "start_timestamp": "2023-10-15 13:00:47", + "end_timestamp": "2023-10-15 13:01:58", + "alerts": [], + "labels": [ + "highly concurrent commits or highly concurrent inserts" + ], + "command": "python anomaly_trigger/main.py --anomaly INSERT_LARGE_DATA", + "script": "import psycopg2\nimport sys\nsys.path.append('/root/DB-GPT/')\nimport time\nimport datetime\nimport random\nimport yaml\nfrom multiprocessing.pool import *\n\nclass DBArgs(object):\n\n def __init__(self, dbtype, config, dbname=None):\n self.dbtype = dbtype\n if self.dbtype == 'mysql':\n self.host = config['host']\n self.port = config['port']\n self.user = config['user']\n self.password = config['password']\n self.dbname = dbname if dbname else config['dbname']\n self.driver = 'com.mysql.jdbc.Driver'\n self.jdbc = 'jdbc:mysql://'\n else:\n self.host = config['host']\n self.port = config['port']\n self.user = config['user']\n self.password = config['password']\n self.dbname = dbname if dbname else config['dbname']\n self.driver = 'org.postgresql.Driver'\n self.jdbc = 'jdbc:postgresql://'\n\nclass Database():\n def __init__(self, args, timeout=-1):\n self.args = args\n self.conn = self.resetConn(timeout)\n\n\n # self.schema = self.compute_table_schema()\n\n def resetConn(self, timeout=-1):\n conn = psycopg2.connect(database=self.args.dbname,\n user=self.args.user,\n password=self.args.password,\n host=self.args.host,\n port=self.args.port)\n return conn\n\n def execute_sqls(self,sql):\n self.conn =self.resetConn(timeout=-1)\n cur = self.conn.cursor()\n cur.execute(sql)\n self.conn.commit()\n cur.close()\n self.conn.close()\n\n def execute_sql_duration(self, duration, sql, max_id=0, commit_interval=500):\n self.conn = self.resetConn(timeout=-1)\n cursor = self.conn.cursor()\n start = time.time()\n cnt = 0\n if duration > 0:\n while (time.time() - start) < duration:\n if max_id > 0:\n id = random.randint(1, max_id - 1)\n cursor.execute(sql + str(id) + ';')\n else:\n cursor.execute(sql)\n cnt += 1\n if cnt % commit_interval == 0:\n self.conn.commit()\n else:\n print(\"error, the duration should be larger than 0\")\n self.conn.commit()\n cursor.close()\n self.conn.close()\n return cnt\n\n def concurrent_execute_sql(self, threads, duration, sql, max_id=0, commit_interval=500):\n pool = ThreadPool(threads)\n results = [pool.apply_async(self.execute_sql_duration, (duration, sql, max_id, commit_interval)) for _ in range(threads)]\n pool.close()\n pool.join()\n return results\n \ndef init():\n #add the config\n config_path = \"/root/DB-GPT/config/tool_config.yaml\"\n with open(config_path, 'r') as 
config_file:\n config = yaml.safe_load(config_file) \n db_args =DBArgs('pgsql', config)\n return db_args\n\n#create a table\ndef create_table(table_name,colsize, ncolumns):\n db=Database(init())\n column_definitions = ', '.join(f'name{i} varchar({colsize})' for i in range(ncolumns))\n creat_sql = f'CREATE TABLE {table_name} (id int, {column_definitions}, time timestamp);'\n db.execute_sqls(creat_sql)\n\n#delete the table\ndef delete_table(table_name):\n db=Database(init())\n delete_sql=f'DROP TABLE if exists {table_name}'\n db.execute_sqls(delete_sql)\n\n#print the current time\ndef print_time():\n current_time = datetime.datetime.now()\n formatted_time = current_time.strftime(\"%Y-%m-%d %H:%M:%S\")\n print(formatted_time)\n\n\ndef insert_large_data(threads,duration,ncolumns,nrows,colsize,table_name='table1'):\n\n print_time()\n #Delete undeleted tables\n delete_table(table_name)\n #create a new table\n create_table(table_name,colsize, ncolumns)\n db=Database(init())\n #insert the data\n #insert_definitions = ', '.join(f'repeat(round(random()*999)::text,{(colsize//3)})' for i in range(ncolumns))\n insert_definitions = ', '.join(f'(SELECT substr(md5(random()::text), 1, {colsize}))' for i in range(ncolumns))\n insert_data=f'insert into {table_name} select generate_series(1,{nrows}),{insert_definitions}, now();'\n db.concurrent_execute_sql(threads,duration,insert_data,commit_interval=1)\n\n #delete the table\n delete_table(table_name)\n \n #print the end time\n print_time()\nif __name__ == \"__main__\":\n # Number of threads to use for concurrent inserts\n num_threads = 97\n \n # Duration for which to run the inserts (in seconds)\n insert_duration = None\n \n # Number of columns in the table\n num_columns = 20\n \n # Number of rows to insert\n num_rows = 68\n \n # Size of each column (in characters)\n column_size = 58\n \n # Table name\n table_name = 'table1'\n \n # Call the insert_large_data function\n insert_large_data(num_threads, insert_duration, num_columns, num_rows, column_size, table_name)\n", + "description": "In an internet of things (IoT) application, with 97 sensors generating a large amount of data, there is an attempt to simultaneously insert this data into a database table. The table has 20 columns and 68 rows, with each column having a size of 58 characters. This process aims to simulate the database exception caused by inserting a large volume of data.\n", + "desc": "In an Internet of Things (IoT) scenario, there is a database named 'IoTDataDB' that is specifically designed to store data from various sensors deployed in a smart home environment. This database is responsible for collecting and analyzing sensor data, such as temperature, humidity, motion, light, and pressure. One of the key tables in the database is called 'SensorReadings', which contains data from 68 sensors. Each row in this table represents a reading from a specific sensor, and there are 20 columns that store information about these readings. The columns may include sensor ID, reading type, reading value, timestamp, location, and status.In this specific scenario, there is an anomaly triggered by inserting large amounts of data into the 'SensorReadings' table. This anomaly occurs when 97 threads simultaneously insert large volumes of sensor readings into the database. Due to the lack of proper data partitioning or indexing, the database struggles to handle such a high influx of data. This can lead to performance issues, such as increased write latency and potential database locking. 
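Much of the write latency described above comes from committing each row individually while many threads insert at once. A minimal sketch, assuming a hypothetical sensor_readings table and placeholder connection settings (neither is part of the recorded trigger script), of batching rows with psycopg2's execute_values so that each commit covers many inserts:

import psycopg2
from psycopg2.extras import execute_values

def insert_readings_batched(rows, batch_size=1000):
    # rows: a list of (sensor_id, reading, ts) tuples; all names here are placeholders
    conn = psycopg2.connect(database="sensordatadb", user="user",
                            password="secret", host="localhost", port=5432)
    cur = conn.cursor()
    for start in range(0, len(rows), batch_size):
        batch = rows[start:start + batch_size]
        # one multi-row INSERT per batch instead of one statement per row
        execute_values(
            cur,
            "INSERT INTO sensor_readings (sensor_id, reading, ts) VALUES %s",
            batch,
        )
        conn.commit()  # one commit per batch, not per row
    cur.close()
    conn.close()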
These anomalies can affect the overall efficiency of the smart home system and may cause delays or failures in processing sensor data.\n" + }, + "338": { + "start_time": "1697346178", + "end_time": "1697346239", + "start_timestamp": "2023-10-15 13:02:58", + "end_timestamp": "2023-10-15 13:03:59", + "alerts": [], + "labels": [ + "highly concurrent updates" + ], + "command": "python anomaly_trigger/main.py --anomaly LOCK_CONTENTION", + "script": "import psycopg2\nimport sys\nsys.path.append('/root/DB-GPT/')\nimport time\nimport datetime\nimport random\nimport yaml\nfrom multiprocessing.pool import *\n\n\nclass DBArgs(object):\n\n def __init__(self, dbtype, config, dbname=None):\n self.dbtype = dbtype\n if self.dbtype == 'mysql':\n self.host = config['host']\n self.port = config['port']\n self.user = config['user']\n self.password = config['password']\n self.dbname = dbname if dbname else config['dbname']\n self.driver = 'com.mysql.jdbc.Driver'\n self.jdbc = 'jdbc:mysql://'\n else:\n self.host = config['host']\n self.port = config['port']\n self.user = config['user']\n self.password = config['password']\n self.dbname = dbname if dbname else config['dbname']\n self.driver = 'org.postgresql.Driver'\n self.jdbc = 'jdbc:postgresql://'\n\nclass Database():\n def __init__(self, args, timeout=-1):\n self.args = args\n self.conn = self.resetConn(timeout)\n\n\n # self.schema = self.compute_table_schema()\n\n def resetConn(self, timeout=-1):\n conn = psycopg2.connect(database=self.args.dbname,\n user=self.args.user,\n password=self.args.password,\n host=self.args.host,\n port=self.args.port)\n return conn\n \n def execute_sqls(self,sql):\n self.conn =self.resetConn(timeout=-1)\n cur = self.conn.cursor()\n cur.execute(sql)\n self.conn.commit()\n cur.close()\n self.conn.close()\n\n def execute_sql_duration(self, duration, sql, max_id=0, commit_interval=500):\n self.conn = self.resetConn(timeout=-1)\n cursor = self.conn.cursor()\n start = time.time()\n cnt = 0\n if duration > 0:\n while (time.time() - start) < duration:\n if max_id > 0:\n id = random.randint(1, max_id - 1)\n cursor.execute(sql + str(id) + ';')\n else:\n cursor.execute(sql)\n cnt += 1\n if cnt % commit_interval == 0:\n self.conn.commit()\n else:\n print(\"error, the duration should be larger than 0\")\n self.conn.commit()\n cursor.close()\n self.conn.close()\n return cnt\n\n def concurrent_execute_sql(self, threads, duration, sql, max_id=0, commit_interval=500):\n pool = ThreadPool(threads)\n results = [pool.apply_async(self.execute_sql_duration, (duration, sql, max_id, commit_interval)) for _ in range(threads)]\n pool.close()\n pool.join()\n return results\n\ndef init():\n #add the config\n config_path = \"/root/DB-GPT/config/tool_config.yaml\"\n with open(config_path, 'r') as config_file:\n config = yaml.safe_load(config_file) \n db_args =DBArgs('pgsql', config)\n return db_args\n\n\n#create a table\ndef create_table(table_name,colsize, ncolumns):\n db=Database(init())\n column_definitions = ', '.join(f'name{i} varchar({colsize})' for i in range(ncolumns))\n creat_sql = f'CREATE TABLE {table_name} (id int, {column_definitions}, time timestamp);'\n db.execute_sqls(creat_sql)\n\n#delete the table\ndef delete_table(table_name):\n db=Database(init())\n delete_sql=f'DROP TABLE if exists {table_name}'\n db.execute_sqls(delete_sql)\n\n#print the current time\ndef print_time():\n current_time = datetime.datetime.now()\n formatted_time = current_time.strftime(\"%Y-%m-%d %H:%M:%S\")\n print(formatted_time)\n\n\ndef 
lock_contention(threads,duration,ncolumns,nrows,colsize,table_name='table1'):\n #create a new table\n print_time()\n delete_table(table_name)\n create_table(table_name,colsize, ncolumns)\n db=Database(init())\n # insert some data to be updated \n insert_definitions = ', '.join(f'(SELECT substr(md5(random()::text), 1, {colsize}))' for i in range(ncolumns))\n insert_data=f'insert into {table_name} select generate_series(1,{nrows}),{insert_definitions}, now();' \n db.execute_sqls(insert_data) \n pool = Pool(threads)\n for _ in range(threads):\n pool.apply_async(\n lock, (table_name, ncolumns, colsize, duration, nrows))\n pool.close()\n pool.join()\n #delete the table\n delete_table(table_name)\n print_time()\n\ndef lock(table_name, ncolumns, colsize, duration, nrows):\n args=init()\n start = time.time()\n #lock_contention\n while time.time()-start < duration:\n conn = psycopg2.connect(database=args.dbname, user=args.user, password=args.password,\n host=args.host, port=args.port)\n cur = conn.cursor()\n while time.time()-start < duration:\n col_name = random.randint(0, ncolumns-1)\n row_name = random.randint(1, nrows-1)\n lock_contention = f'update {table_name} set name{col_name}=(SELECT substr(md5(random()::text), 1, {colsize})) where id ={row_name}'\n #db.concurrent_execute_sql(threads,duration,lock_contention,nrows)\n cur.execute(lock_contention)\n conn.commit()\n conn.commit()\n conn.close()\n\nif __name__ == \"__main__\":\n # Number of threads to use for concurrent inserts\n num_threads = 56\n \n # Duration for which to run the inserts (in seconds)\n insert_duration = None\n \n # Number of columns in the table\n num_columns = 67\n \n # Number of rows to insert\n num_rows = 282\n \n # Size of each column (in characters)\n column_size = 99\n \n # Table name\n table_name = 'table1'\n \n # Call the insert_large_data function\n lock_contention(num_threads, insert_duration, num_columns, num_rows, column_size, table_name)\n", + "description": "In a database used by an online platform, there is a high contention for accessing and updating the records of 282 products in a table with 67 columns. Each column can store up to 99 characters. This contention is caused by 56 users simultaneously trying to perform update operations on the database.\n", + "desc": "In the scenario of an IoT application, there is a database named 'IoTDataDB', which is used for storing sensor data collected from various IoT devices. This database contains a key table named 'SensorReadings', which records detailed information about the sensor readings. The table consists of 282 rows of data, with each row representing a sensor reading, and a total of 67 columns, each containing information of up to 99 characters. These columns may include sensor ID, sensor type, reading value, timestamp, geographical location, device status, and more. Imagine that in this scenario, 56 devices are simultaneously transmitting sensor readings to the database at a high frequency. However, due to the lack of efficient locking mechanisms or suboptimal database design, concurrent write operations on the database might result in locking contention. This means that when multiple devices attempt to write data to the same or adjacent rows in the 'SensorReadings' table, they might compete for locking the database table, leading to delays and potential conflicts. 
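Lock contention of this kind is easiest to confirm by asking PostgreSQL which sessions are currently blocked and by whom. A minimal diagnostic sketch, assuming placeholder connection settings and PostgreSQL 9.6 or newer for pg_blocking_pids:

import psycopg2

BLOCKED_SESSIONS_SQL = """
SELECT pid,
       pg_blocking_pids(pid) AS blocked_by,
       wait_event_type,
       state,
       left(query, 80) AS query
FROM pg_stat_activity
WHERE cardinality(pg_blocking_pids(pid)) > 0;
"""

def show_blocked_sessions():
    conn = psycopg2.connect(database="iotdatadb", user="user",
                            password="secret", host="localhost", port=5432)
    with conn.cursor() as cur:
        cur.execute(BLOCKED_SESSIONS_SQL)
        for pid, blocked_by, wait_type, state, query in cur.fetchall():
            # each row is a session waiting on a lock, plus the pids holding it
            print(pid, blocked_by, wait_type, state, query)
    conn.close()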
These locking contentions can affect the responsiveness and efficiency of the database, as well as the reliability of data storage.\n" + }, + "339": { + "start_time": "1697346299", + "end_time": "1697346402", + "start_timestamp": "2023-10-15 13:04:59", + "end_timestamp": "2023-10-15 13:06:42", + "alerts": [], + "labels": [ + "highly deletes" + ], + "command": "python anomaly_trigger/main.py --anomaly VACUUM", + "script": "import psycopg2\nimport sys\nsys.path.append('/root/DB-GPT/')\nimport time\nimport datetime\nimport random\nimport yaml\nfrom multiprocessing.pool import *\n\n\nclass DBArgs(object):\n\n def __init__(self, dbtype, config, dbname=None):\n self.dbtype = dbtype\n if self.dbtype == 'mysql':\n self.host = config['host']\n self.port = config['port']\n self.user = config['user']\n self.password = config['password']\n self.dbname = dbname if dbname else config['dbname']\n self.driver = 'com.mysql.jdbc.Driver'\n self.jdbc = 'jdbc:mysql://'\n else:\n self.host = config['host']\n self.port = config['port']\n self.user = config['user']\n self.password = config['password']\n self.dbname = dbname if dbname else config['dbname']\n self.driver = 'org.postgresql.Driver'\n self.jdbc = 'jdbc:postgresql://'\n\nclass Database():\n def __init__(self, args, timeout=-1):\n self.args = args\n self.conn = self.resetConn(timeout)\n\n\n # self.schema = self.compute_table_schema()\n\n def resetConn(self, timeout=-1):\n conn = psycopg2.connect(database=self.args.dbname,\n user=self.args.user,\n password=self.args.password,\n host=self.args.host,\n port=self.args.port)\n return conn\n \n def execute_sqls(self,sql):\n self.conn =self.resetConn(timeout=-1)\n cur = self.conn.cursor()\n cur.execute(sql)\n self.conn.commit()\n cur.close()\n self.conn.close()\n\n def execute_sql_duration(self, duration, sql, max_id=0, commit_interval=500):\n self.conn = self.resetConn(timeout=-1)\n cursor = self.conn.cursor()\n start = time.time()\n cnt = 0\n if duration > 0:\n while (time.time() - start) < duration:\n if max_id > 0:\n id = random.randint(1, max_id - 1)\n cursor.execute(sql + str(id) + ';')\n else:\n cursor.execute(sql)\n cnt += 1\n if cnt % commit_interval == 0:\n self.conn.commit()\n else:\n print(\"error, the duration should be larger than 0\")\n self.conn.commit()\n cursor.close()\n self.conn.close()\n return cnt\n\n def concurrent_execute_sql(self, threads, duration, sql, max_id=0, commit_interval=500):\n pool = ThreadPool(threads)\n results = [pool.apply_async(self.execute_sql_duration, (duration, sql, max_id, commit_interval)) for _ in range(threads)]\n pool.close()\n pool.join()\n return results\n \ndef init():\n #add the config\n config_path = \"/root/DB-GPT/config/tool_config.yaml\"\n with open(config_path, 'r') as config_file:\n config = yaml.safe_load(config_file) \n db_args =DBArgs('pgsql', config)\n return db_args\n\n\n#create a table\ndef create_table(table_name,colsize, ncolumns):\n db=Database(init())\n column_definitions = ', '.join(f'name{i} varchar({colsize})' for i in range(ncolumns))\n creat_sql = f'CREATE TABLE {table_name} (id int, {column_definitions}, time timestamp);'\n db.execute_sqls(creat_sql)\n\n#delete the table\ndef delete_table(table_name):\n db=Database(init())\n delete_sql=f'DROP TABLE if exists {table_name}'\n db.execute_sqls(delete_sql)\n\n#print the current time\ndef print_time():\n current_time = datetime.datetime.now()\n formatted_time = current_time.strftime(\"%Y-%m-%d %H:%M:%S\")\n print(formatted_time)\n\n\ndef 
vacuum(threads,duration,ncolumns,nrows,colsize,table_name='table1'):\n db=Database(init())\n #create a new table\n print_time()\n delete_table(table_name)\n create_table(table_name,colsize, ncolumns)\n\n # insert some data to be deleted\n insert_definitions = ', '.join(f'(SELECT substr(md5(random()::text), 1, {colsize}))' for i in range(ncolumns))\n insert_data=f'insert into {table_name} select generate_series(1,{nrows}),{insert_definitions}, now();' \n db.execute_sqls(insert_data) \n\n # delete 80% of the rows\n delete_nrows=int(nrows*0.8)\n vacuum=f'delete from {table_name} where id < {delete_nrows};'\n db.execute_sqls(vacuum)\n\n # do the select , then the vacuum occurs\n select='select * from '+table_name+' where id='\n db.concurrent_execute_sql(threads,duration,select,nrows)\n\n #delete the table\n delete_table(table_name)\n print_time()\n\n\nif __name__ == \"__main__\":\n # Number of threads to use for concurrent inserts\n num_threads = 163\n \n # Duration for which to run the inserts (in seconds)\n insert_duration = None\n \n # Number of columns in the table\n num_columns = 6\n \n # Number of rows to insert\n num_rows = 3104782\n \n # Size of each column (in characters)\n column_size = 58\n \n # Table name\n table_name = 'table1'\n \n # Call the insert_large_data function\n vacuum(num_threads, insert_duration, num_columns, num_rows, column_size, table_name)\n", + "description": "In a database used by an online shopping platform, a large-scale data cleaning operation is performed on a table containing 6 columns and 3,104,782 rows of product records. The database is then subjected to a search operation by 163 users simultaneously. The search includes terms such as product name, category, and price range. However, due to the high number of concurrent searches and the previous data cleaning operation, an exception occurs in the database.\n", + "desc": "In an IoT scenario, there is a database named 'IoTDataDB' that is used to store sensor data collected from various IoT devices. This database contains a table called 'SensorReadings', which stores information about the sensor readings from these devices. The 'SensorReadings' table has 3,104,782 rows of data, with each row representing a reading from a specific sensor. There are 6 columns in this table, including the sensor ID, sensor type, reading value, timestamp, location, and device status. Each column can store up to 58 characters of information. In this scenario, there are 163 threads that are processing and analyzing sensor data concurrently. However, due to the high volume of data and the lack of regular database maintenance operations such as vacuuming, the database performance can be impacted. Without proper vacuuming, the database can become fragmented, leading to inefficient storage allocation and slower query speeds. This can result in delays in processing the sensor data and analyzing it in real-time. 
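The bloat described here shows up directly in the dead-tuple counters, and a manual VACUUM reclaims the space left behind by the mass DELETE. A minimal sketch, assuming placeholder connection settings and the table1 name used by the trigger scripts:

import psycopg2

def report_dead_tuples_and_vacuum(table_name="table1"):
    conn = psycopg2.connect(database="iotdatadb", user="user",
                            password="secret", host="localhost", port=5432)
    conn.autocommit = True  # VACUUM cannot run inside a transaction block
    cur = conn.cursor()
    cur.execute(
        "SELECT relname, n_live_tup, n_dead_tup, last_autovacuum "
        "FROM pg_stat_user_tables WHERE relname = %s;",
        (table_name,),
    )
    print(cur.fetchall())  # a large n_dead_tup after the mass DELETE indicates bloat
    # identifiers cannot be bound as parameters, so the table name is interpolated
    cur.execute(f"VACUUM (VERBOSE, ANALYZE) {table_name};")
    cur.close()
    conn.close()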
Therefore, it is important to periodically perform vacuuming operations on the 'SensorReadings' table to optimize database performance and ensure efficient data storage.\n" + }, + "340": { + "start_time": "1697346462", + "end_time": "1697346576", + "start_timestamp": "2023-10-15 13:07:42", + "end_timestamp": "2023-10-15 13:09:36", + "alerts": [], + "labels": [ + "too many indexes" + ], + "command": "python anomaly_trigger/main.py --anomaly REDUNDANT_INDEX", + "script": "import psycopg2\nimport sys\nsys.path.append('/root/DB-GPT/')\nimport time\nimport datetime\nimport random\nimport yaml\nfrom multiprocessing.pool import *\n\n\nclass DBArgs(object):\n\n def __init__(self, dbtype, config, dbname=None):\n self.dbtype = dbtype\n if self.dbtype == 'mysql':\n self.host = config['host']\n self.port = config['port']\n self.user = config['user']\n self.password = config['password']\n self.dbname = dbname if dbname else config['dbname']\n self.driver = 'com.mysql.jdbc.Driver'\n self.jdbc = 'jdbc:mysql://'\n else:\n self.host = config['host']\n self.port = config['port']\n self.user = config['user']\n self.password = config['password']\n self.dbname = dbname if dbname else config['dbname']\n self.driver = 'org.postgresql.Driver'\n self.jdbc = 'jdbc:postgresql://'\n\nclass Database():\n def __init__(self, args, timeout=-1):\n self.args = args\n self.conn = self.resetConn(timeout)\n\n\n # self.schema = self.compute_table_schema()\n\n def resetConn(self, timeout=-1):\n conn = psycopg2.connect(database=self.args.dbname,\n user=self.args.user,\n password=self.args.password,\n host=self.args.host,\n port=self.args.port)\n return conn\n \n def execute_sqls(self,sql):\n self.conn =self.resetConn(timeout=-1)\n cur = self.conn.cursor()\n cur.execute(sql)\n self.conn.commit()\n cur.close()\n self.conn.close()\n\n def execute_sql_duration(self, duration, sql, max_id=0, commit_interval=500):\n self.conn = self.resetConn(timeout=-1)\n cursor = self.conn.cursor()\n start = time.time()\n cnt = 0\n if duration > 0:\n while (time.time() - start) < duration:\n if max_id > 0:\n id = random.randint(1, max_id - 1)\n cursor.execute(sql + str(id) + ';')\n else:\n cursor.execute(sql)\n cnt += 1\n if cnt % commit_interval == 0:\n self.conn.commit()\n else:\n print(\"error, the duration should be larger than 0\")\n self.conn.commit()\n cursor.close()\n self.conn.close()\n return cnt\n\n def concurrent_execute_sql(self, threads, duration, sql, max_id=0, commit_interval=500):\n pool = ThreadPool(threads)\n results = [pool.apply_async(self.execute_sql_duration, (duration, sql, max_id, commit_interval)) for _ in range(threads)]\n pool.close()\n pool.join()\n return results\n \n def build_index(self, table_name, idx_num):\n self.conn = self.resetConn(timeout=-1)\n cursor = self.conn.cursor()\n \n for i in range(0, idx_num):\n the_sql = 'CREATE INDEX index_' + table_name + '_' + str(i) + ' ON ' + table_name + '(name' + str(i) + ');'\n print(the_sql)\n cursor.execute(the_sql)\n\n \n self.conn.commit()\n self.conn.close()\n return\n\n\n \n def drop_index(self,table_name):\n self.conn = self.resetConn(timeout=-1)\n cursor = self.conn.cursor()\n cursor.execute(\"select indexname from pg_indexes where tablename='\"+table_name+\"';\")\n idxs = cursor.fetchall()\n for idx in idxs:\n the_sql = 'DROP INDEX ' + idx[0] + ';'\n cursor.execute(the_sql)\n print(the_sql)\n self.conn.commit()\n self.conn.close()\n return\n\n\ndef init():\n #add the config\n config_path = \"/root/DB-GPT/config/tool_config.yaml\"\n with open(config_path, 'r') as 
config_file:\n config = yaml.safe_load(config_file) \n db_args =DBArgs('pgsql', config)\n return db_args\n\n\n#create a table\ndef create_table(table_name,colsize, ncolumns):\n db=Database(init())\n column_definitions = ', '.join(f'name{i} varchar({colsize})' for i in range(ncolumns))\n creat_sql = f'CREATE TABLE {table_name} (id int, {column_definitions}, time timestamp);'\n db.execute_sqls(creat_sql)\n\n#delete the table\ndef delete_table(table_name):\n db=Database(init())\n delete_sql=f'DROP TABLE if exists {table_name}'\n db.execute_sqls(delete_sql)\n\n#print the current time\ndef print_time():\n current_time = datetime.datetime.now()\n formatted_time = current_time.strftime(\"%Y-%m-%d %H:%M:%S\")\n print(formatted_time)\n\ndef redundent_index(threads,duration,ncolumns,nrows,colsize,nindex,table_name='table1'):\n #create a new table\n print_time()\n delete_table(table_name)\n create_table(table_name,colsize, ncolumns)\n db=Database(init())\n # insert some data to be updated \n insert_definitions = ', '.join(f'(SELECT substr(md5(random()::text), 1, {colsize}))' for i in range(ncolumns))\n insert_data=f'insert into {table_name} select generate_series(1,{nrows}),{insert_definitions}, now();' \n db.execute_sqls(insert_data) \n\n #initialization of the indexes\n nindex=int((nindex*ncolumns)/10)\n db.build_index(table_name,nindex)\n id_index='CREATE INDEX index_'+table_name+'_id ON '+table_name+'(id);'\n db.execute_sqls(id_index)\n\n #lock_contention\n pool = Pool(threads)\n for _ in range(threads):\n pool.apply_async(\n lock, (table_name, ncolumns, colsize, duration, nrows))\n pool.close()\n pool.join()\n\n #drop the index\n db.drop_index(table_name)\n\n #delete the table\n delete_table(table_name)\n print_time()\n\n\ndef lock(table_name, ncolumns, colsize, duration, nrows):\n args=init()\n start = time.time()\n #lock_contention\n while time.time()-start < duration:\n conn = psycopg2.connect(database=args.dbname, user=args.user, password=args.password,\n host=args.host, port=args.port)\n cur = conn.cursor()\n while time.time()-start < duration:\n col_name = random.randint(0, ncolumns-1)\n row_name = random.randint(1, nrows-1)\n lock_contention = f'update {table_name} set name{col_name}=(SELECT substr(md5(random()::text), 1, {colsize})) where id ={row_name}'\n #db.concurrent_execute_sql(threads,duration,lock_contention,nrows)\n cur.execute(lock_contention)\n conn.commit()\n conn.commit()\n conn.close()\n\nif __name__ == \"__main__\":\n # Number of threads to use for concurrent inserts\n num_threads = 10\n \n # Duration for which to run the inserts (in seconds)\n insert_duration = None\n \n # Number of columns in the table\n num_columns = 82\n \n # Number of rows to insert\n num_rows = 583072\n \n # Size of each column (in characters)\n column_size = 51\n \n # Table name\n table_name = 'table1'\n \n nindex=6\n \n # Call the insert_large_data function\n redundent_index(num_threads, insert_duration, num_columns, num_rows, column_size, nindex,table_name)\n", + "description": "In a database for an e-commerce website, 10 users are performing a query operation on a table containing 82 columns and 583,072 rows. Each column has a size of 51 characters. 
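Redundant indexes of the kind created by this trigger can be spotted by comparing how often each index is actually scanned with how much space it occupies. A minimal sketch, assuming placeholder connection settings, that lists rarely used indexes on the affected table:

import psycopg2

UNUSED_INDEXES_SQL = """
SELECT indexrelname,
       idx_scan,
       pg_size_pretty(pg_relation_size(indexrelid)) AS index_size
FROM pg_stat_user_indexes
WHERE relname = %s
ORDER BY idx_scan ASC, pg_relation_size(indexrelid) DESC;
"""

def list_rarely_used_indexes(table_name="table1"):
    conn = psycopg2.connect(database="iotdatadb", user="user",
                            password="secret", host="localhost", port=5432)
    with conn.cursor() as cur:
        cur.execute(UNUSED_INDEXES_SQL, (table_name,))
        for name, scans, size in cur.fetchall():
            # indexes with near-zero scans but nontrivial size are DROP candidates
            print(f"{name}: {scans} scans, {size}")
    conn.close()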
However, the query involves creating redundant indexes for items such as product name, category, and price range, which can lead to additional storage footprint and performance overhead.\n", + "desc": "In this scenario of an Internet of Things (IoT) application, there is a database named 'IoTDataDB' that is responsible for storing and processing sensor data from various devices. This database consists of multiple tables, one of which is a key table named 'SensorReadings'. This table contains a large volume of data from sensors, such as temperature, humidity, pressure, light, and motion sensors. Each row in the table represents a specific sensor reading, and there are 583,072 rows of data in total. The 'SensorReadings' table has 82 columns, each containing information up to 51 characters, including sensor ID, reading type, reading value, timestamp, location, and other related attributes.In this particular scenario, there might be a situation where the database administrator creates multiple redundant indexes on the 'SensorReadings' table. These indexes are created to accelerate queries that involve filtering or aggregating sensor data based on certain criteria, such as specific sensor types or sensor locations. However, due to the large number of indexes and their overlapping functionalities, these redundant indexes can cause unnecessary storage usage and additional overhead in query execution. This can result in decreased performance and efficiency when executing queries against the 'SensorReadings' table.In an IoT environment, where continuous sensor data is being collected and processed in real-time, it is crucial to ensure efficient and optimized query performance. Reducing redundancies in index creation and considering the specific query requirements can help alleviate performance bottlenecks and maintain smooth operation of the IoT data system.\n" + }, + "341": { + "start_time": "1697346636", + "end_time": "1697346727", + "start_timestamp": "2023-10-15 13:10:36", + "end_timestamp": "2023-10-15 13:12:07", + "alerts": [], + "labels": [ + "INSERT_LARGE_DATA", + "IO_CONTENTION" + ], + "command": "python anomaly_trigger/main.py --anomaly INSERT_LARGE_DATA,IO_CONTENTION", + "script": "import os\nimport datetime\n\n#print the current time\ndef print_time():\n current_time = datetime.datetime.now()\n formatted_time = current_time.strftime(\"%Y-%m-%d %H:%M:%S\")\n print(formatted_time)\n\nif __name__ == \"__main__\":\n print_time()\n command = (\n \"su - root -c 'cd /sysbench-tpcc-master; \"\n \"./tpcc.lua --db-driver=pgsql --tables=2 --scale=3 --threads=50 --events=0 \"\n \"--pgsql-host=xxxx --pgsql-user=xxxx --pgsql-password=xxxx \"\n \"--pgsql-port=5432 --pgsql-db=tpcc --time=90 --rand-type=uniform --report-interval=10 run'\"\n )\n\n os.system(command)\n print_time()\n", + "description": "In a file sharing system where multiple users are simultaneously uploading, downloading, or editing files, the system is experiencing I/O contention. This means that the file transfer speed is being slowed down due to the competition for input/output resources.\n", + "desc": "In a file sharing system scenario, let's imagine a database named 'TeamFileShareDB' that is used by teams or organizations to share files. This database stores both the files themselves and the metadata associated with them, including details like uploader information, file size, creation and modification dates, version history, access permissions, and download counts. 
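Whether the pressure in a scenario like this is really I/O-bound can be checked from PostgreSQL's per-database counters; note that blk_read_time and blk_write_time are only populated when track_io_timing is enabled. A minimal sketch with placeholder connection settings:

import psycopg2

IO_STATS_SQL = """
SELECT datname,
       blks_read,
       blks_hit,
       blk_read_time,
       blk_write_time,
       tup_inserted,
       tup_updated
FROM pg_stat_database
WHERE datname = current_database();
"""

def show_io_pressure():
    conn = psycopg2.connect(database="teamfilesharedb", user="user",
                            password="secret", host="localhost", port=5432)
    with conn.cursor() as cur:
        cur.execute(IO_STATS_SQL)
        # a low blks_hit / (blks_hit + blks_read) ratio and growing blk_*_time
        # values point at disk I/O rather than CPU as the bottleneck
        print(cur.fetchone())
    conn.close()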
During a typical workday, multiple users may be simultaneously uploading, downloading, or editing files. For example, a team working on an important project might frequently upload new versions of files, while others download them for viewing or editing. Additionally, the system might also be used for storing and sharing large files, such as presentations, video conference recordings, or design drawings. With such high concurrency in file operations, the 'TeamFileShareDB' database faces challenges related to I/O contention. When multiple users upload or download large files simultaneously, the system's storage and network bandwidth can become significantly strained. This I/O contention can result in slower file transfer speeds, especially in situations where there is limited bandwidth or insufficient server processing capabilities. Furthermore, frequent write operations in the database, such as file uploads and metadata updates, can impact overall database performance. During peak periods, the database may experience locking and transaction management issues, further slowing down file processing and metadata recording.\n" + }, + "342": { + "start_time": "1697346787", + "end_time": "1697346847", + "start_timestamp": "2023-10-15 13:13:07", + "end_timestamp": "2023-10-15 13:14:07", + "alerts": [], + "labels": [ + "POOR JOIN PERFORMANCE", + "CPU CONTENTION" + ], + "command": "python anomaly_trigger/main.py --anomaly POOR_JOIN_PERFORMANCE,CPU_CONTENTION", + "script": "import os\nimport re\nimport time\n\nimport psycopg2\n\n\nREPEATCOUNT = 1\nTIMELOGPATH = str(int(time.time())) + \"_job_trigger_time_log.txt\"\nTIMELOG = open(TIMELOGPATH, 'w+')\n\n\nclass Database():\n\n def __init__(self):\n self.conn = None\n self.conn = psycopg2.connect(database='imdbload',\n user='xxxx',\n password='xxxx',\n host='xxxx',\n port=5432)\n\n def execute_sql(self, sql):\n fail = 1\n cur = self.conn.cursor()\n i = 0\n cnt = 3\n while fail == 1 and i < cnt:\n try:\n fail = 0\n cur.execute(sql)\n except BaseException as error:\n fail = 1\n print(error)\n res = []\n if fail == 0:\n res = cur.fetchall()\n i = i + 1\n if fail == 1:\n # print(\"SQL Execution Fatal!!\", sql)\n return 0, ''\n elif fail == 0:\n return 1, res\n\n\ndef all_sql_files():\n res_path = \"{}/join-order-benchmark-master/\".format(\n os.path.dirname(os.path.abspath(__file__)))\n # all_file_list = list(filter(file_filter, os.listdir(res_path)))\n # all_file_list = sorted(all_file_list, key=custom_sort)\n all_file_list = [\n '1a.sql', '1b.sql', '1c.sql', '1d.sql',\n '2a.sql', '2b.sql', '2c.sql', '2d.sql',\n '3a.sql', '3b.sql', '3c.sql',\n '4a.sql', '4b.sql', '4c.sql',\n '5a.sql', '5b.sql', '5c.sql',\n '6a.sql', '6b.sql', '6c.sql', '6d.sql', '6e.sql', '6f.sql',\n '7a.sql', '7b.sql', '7c.sql',\n '8a.sql', '8b.sql', '8c.sql', '8d.sql',\n '9a.sql', '9b.sql', '9c.sql', '9d.sql',\n '10a.sql', '10b.sql', '10c.sql',\n '11a.sql', '11b.sql', '11c.sql', '11d.sql',\n '12a.sql', '12b.sql', '12c.sql',\n '13a.sql', '13b.sql', '13c.sql', '13d.sql',\n '14a.sql', '14b.sql', '14c.sql',\n '15a.sql', '15b.sql', '15c.sql', '15d.sql',\n '16a.sql', '16b.sql', '16c.sql', '16d.sql',\n '17a.sql', '17b.sql', '17c.sql', '17d.sql', '17e.sql', '17f.sql',\n '18a.sql', '18b.sql', '18c.sql',\n '19a.sql', '19b.sql', '19c.sql', '19d.sql',\n '20a.sql', '20b.sql', '20c.sql',\n '21a.sql', '21b.sql', '21c.sql',\n '22a.sql', '22b.sql', '22c.sql', '22d.sql',\n '23a.sql', '23b.sql', '23c.sql',\n '24a.sql', '24b.sql',\n '25a.sql', '25b.sql', '25c.sql',\n '26a.sql', '26b.sql', '26c.sql',\n 
'27a.sql', '27b.sql', '27c.sql',\n '28a.sql', '28b.sql', '28c.sql',\n '29a.sql', '29b.sql', '29c.sql',\n '30a.sql', '30b.sql', '30c.sql',\n '31a.sql', '31b.sql', '31c.sql',\n '32a.sql', '32b.sql',\n '33a.sql', '33b.sql', '33c.sql']\n\n print(all_file_list)\n files_list = []\n for file in all_file_list:\n files_list.append(res_path + file)\n return files_list\n\n\ndef custom_sort(item):\n # \u63d0\u53d6\u6570\u5b57\u548c\u5b57\u6bcd\u90e8\u5206\n match = re.match(r'(\\d+)(\\D+)', item)\n # \u5c06\u6570\u5b57\u90e8\u5206\u8f6c\u6362\u4e3a\u6574\u6570\u4ee5\u8fdb\u884c\u6bd4\u8f83\n num_part = int(match.group(1))\n # \u8fd4\u56de\u5143\u7ec4\u4ee5\u6309\u6570\u5b57\u548c\u5b57\u6bcd\u6392\u5e8f\n return (num_part, match.group(2))\n\n\ndef file_filter(f):\n if f[-4:] == '.sql' and 'schema' not in f and 'fkindexes' not in f:\n return True\n else:\n return False\n\n\ndef get_sql_from_file(file_name):\n file = open(file_name)\n lines = file.readlines().copy()\n sql = ''\n for line in lines:\n sql += line\n sql = sql.replace('\n', ' ').replace(' ', ' ').replace(' ', ' ')\n file.close()\n return sql\n\n\ndef test_hint_from_file(sql_file):\n db = Database()\n sql = get_sql_from_file(sql_file)\n success, result_cont = db.execute_sql(sql)\n print(success, result_cont)\n\n\ndef test_all():\n sql_files = all_sql_files()\n\n for sql_file in list(sql_files)[:-10]:\n if sql_file:\n test_hint_from_file(sql_file)\n\n\ndef test_one():\n res_path = \"{}/join-order-benchmark-master/\".format(\n os.path.dirname(os.path.abspath(__file__)))\n test_hint_from_file(res_path + '1a.sql')\n\n\nif __name__ == '__main__':\n for i in range(0, REPEATCOUNT):\n TIMELOG.write(str(int(time.time()))+\";\")\n test_all()\n TIMELOG.write(str(int(time.time()))+\"\n\")\n TIMELOG.flush()\n\n TIMELOG.close()\n\n)\n", + "description": "In a data analysis scenario, multiple users are performing join operations on a dataset in a database that is experiencing high CPU contention. This leads to poor performance of the join operations, causing delays in data analysis tasks.\n", + "desc": "In an e-commerce scenario, there is a database called 'ECommerceDB' that stores various types of product information. This database includes a table called 'ProductDetails' which contains detailed information about each product, such as the product ID, name, price, stock quantity, brand, category, and more. In this scenario, there is a need to perform complex join queries across multiple tables to generate business reports. These join queries involve joining the 'ProductDetails' table with other tables, such as 'SalesData', 'CustomerProfiles', and 'MarketTrends', to analyze the business data. However, due to the large size of these tables and the complexity of the join operations, the performance of these queries can be slow. This can be caused by factors such as ineffective indexing, suboptimal query optimization, or limited CPU resources. 
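For slow joins like these, the usual first step is to look at the actual execution plan rather than guessing. A minimal sketch, assuming placeholder connection settings and hypothetical product_details and sales_data tables standing in for the scenario's tables, that prints an EXPLAIN (ANALYZE, BUFFERS) plan:

import psycopg2

EXPLAIN_SQL = """
EXPLAIN (ANALYZE, BUFFERS)
SELECT p.product_id, s.total_sales
FROM product_details p
JOIN sales_data s ON s.product_id = p.product_id
WHERE p.category = 'electronics';
"""

def print_join_plan():
    conn = psycopg2.connect(database="ecommercedb", user="user",
                            password="secret", host="localhost", port=5432)
    with conn.cursor() as cur:
        cur.execute(EXPLAIN_SQL)  # ANALYZE actually executes the query
        for (line,) in cur.fetchall():
            print(line)  # look for sequential scans, spills, and bad row estimates
    conn.close()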
When multiple complex join queries are executed simultaneously, there can be competition for CPU resources, leading to CPU contention and reduced query efficiency.\n" + }, + "343": { + "start_time": "1697346907", + "end_time": "1697347047", + "start_timestamp": "2023-10-15 13:15:07", + "end_timestamp": "2023-10-15 13:17:27", + "alerts": [], + "labels": [ + "FETCH_LARGE_DATA", + "CORRELATED SUBQUERY" + ], + "command": "python anomaly_trigger/main.py --anomaly FETCH_LARGE_DATA,CORRELATED_SUBQUERY", + "script": "import os\nimport re\nimport time\n\nimport psycopg2\n\n\nREPEATCOUNT = 1\nTIMELOGPATH = str(int(time.time())) + \"_tpch_trigger_time_log.txt\"\nTIMELOG = open(TIMELOGPATH, 'w+')\n\n\nclass Database():\n\n def __init__(self):\n self.conn = None\n self.conn = psycopg2.connect(database='tpch',\n user='xxxx',\n password='xxxx',\n host='xxxx',\n port=5432)\n\n def execute_sql(self, sql):\n fail = 1\n cur = self.conn.cursor()\n i = 0\n cnt = 3\n while fail == 1 and i < cnt:\n try:\n fail = 0\n cur.execute(sql)\n except BaseException as error:\n fail = 1\n print(error)\n res = []\n if fail == 0:\n res = cur.fetchall()\n i = i + 1\n if fail == 1:\n # print(\"SQL Execution Fatal!!\", sql)\n return 0, ''\n elif fail == 0:\n return 1, res\n\n\ndef all_sql_files():\n res_path = \"{}/tpch-queries/\".format(\n os.path.dirname(os.path.abspath(__file__)))\n # all_file_list = list(filter(file_filter, os.listdir(res_path)))\n # all_file_list = sorted(all_file_list, key=custom_sort)\n all_file_list = [\n '4.explain.sql']\n\n print(all_file_list)\n files_list = []\n for file in all_file_list:\n files_list.append(res_path + file)\n return files_list\n\n\ndef custom_sort(item):\n # \u63d0\u53d6\u6570\u5b57\u548c\u5b57\u6bcd\u90e8\u5206\n match = re.match(r'(\\d+)(\\D+)', item)\n # \u5c06\u6570\u5b57\u90e8\u5206\u8f6c\u6362\u4e3a\u6574\u6570\u4ee5\u8fdb\u884c\u6bd4\u8f83\n num_part = int(match.group(1))\n # \u8fd4\u56de\u5143\u7ec4\u4ee5\u6309\u6570\u5b57\u548c\u5b57\u6bcd\u6392\u5e8f\n return (num_part, match.group(2))\n\n\ndef file_filter(f):\n if f[-4:] == '.sql' and 'schema' not in f and 'fkindexes' not in f:\n return True\n else:\n return False\n\n\ndef get_sql_from_file(file_name):\n file = open(file_name)\n lines = file.readlines().copy()\n sql = ''\n for line in lines:\n sql += line\n sql = sql.replace('\n', ' ').replace(' ', ' ').replace(' ', ' ')\n file.close()\n return sql\n\n\ndef test_hint_from_file(sql_file):\n db = Database()\n sql = get_sql_from_file(sql_file)\n success, result_cont = db.execute_sql(sql)\n print(success, result_cont)\n\n\ndef test_all():\n sql_files = all_sql_files()\n\n for sql_file in list(sql_files):\n if sql_file:\n test_hint_from_file(sql_file)\n\n\ndef test_one():\n res_path = \"{}/tpch-queries/\".format(\n os.path.dirname(os.path.abspath(__file__)))\n test_hint_from_file(res_path + '1.explain.sql')\n\n\nif __name__ == '__main__':\n for i in range(0, REPEATCOUNT):\n TIMELOG.write(str(int(time.time()))+\";\")\n test_all()\n TIMELOG.write(str(int(time.time()))+\"\n\")\n TIMELOG.flush()\n\n TIMELOG.close()\n", + "description": "In a database for an e-commerce platform, there is a query that fetches a large amount of data and involves correlated subqueries. This query is used to retrieve the inventory quantity for each product. 
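The correlated form described here re-runs the inner lookup once per outer row, which is what makes it expensive at scale; rewriting it as a join lets the planner fetch the inventory rows in one pass. A minimal sketch using lowercase placeholder names for the 'ProductDetails' and 'ProductInventory' tables mentioned in the scenario:

# Correlated form: the inner SELECT is re-evaluated for every product row.
CORRELATED = """
SELECT p.product_id,
       (SELECT i.stock_level
        FROM product_inventory i
        WHERE i.product_id = p.product_id) AS stock_level
FROM product_details p
WHERE p.category = %s;
"""

# Join rewrite: the same result, but the inventory table is read in one pass.
REWRITTEN = """
SELECT p.product_id, i.stock_level
FROM product_details p
JOIN product_inventory i ON i.product_id = p.product_id
WHERE p.category = %s;
"""

Both statements take the category as a bound parameter; on large categories the join version typically avoids the per-product index probes that the correlated version forces.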
However, without optimizing the correlated subqueries, the performance of this query may degrade when there is a large number of products in the database.\n", + "desc": "In the database of an e-commerce platform, suppose there is a database named 'ECommerceDB', which includes a crucial table named 'ProductInventory' for recording the inventory information of various products. This table might contain inventory data for tens of thousands or even hundreds of thousands of products. The inventory information for each product includes the product ID, current stock level, last inventory update time, supplier ID, warehouse location, and other details. In this database, querying the inventory level of each product may require performing related subqueries. For example, a common query might be to determine the total current inventory of all products within a specific category. This type of query might first involve selecting all products of a particular category from the 'ProductDetails' table, then conducting subqueries on the 'ProductInventory' table to obtain the inventory data for these products. When the number of products is very large, the performance of these related subqueries can become inefficient. For instance, if a category includes thousands of products, executing individual subqueries for each product to obtain inventory information would be time-consuming. In such cases, due to the need to retrieve inventory information for a large number of products, the database might need to read a significant amount of data from the disk, which could lead to I/O bottlenecks.\n" + }, + "344": { + "start_time": "1697347108", + "end_time": "1697347179", + "start_timestamp": "2023-10-15 13:18:28", + "end_timestamp": "2023-10-15 13:19:39", + "alerts": [], + "labels": [ + "highly concurrent commits or highly concurrent inserts" + ], + "command": "python anomaly_trigger/main.py --anomaly INSERT_LARGE_DATA", + "script": "import psycopg2\nimport sys\nsys.path.append('/root/DB-GPT/')\nimport time\nimport datetime\nimport random\nimport yaml\nfrom multiprocessing.pool import *\n\nclass DBArgs(object):\n\n def __init__(self, dbtype, config, dbname=None):\n self.dbtype = dbtype\n if self.dbtype == 'mysql':\n self.host = config['host']\n self.port = config['port']\n self.user = config['user']\n self.password = config['password']\n self.dbname = dbname if dbname else config['dbname']\n self.driver = 'com.mysql.jdbc.Driver'\n self.jdbc = 'jdbc:mysql://'\n else:\n self.host = config['host']\n self.port = config['port']\n self.user = config['user']\n self.password = config['password']\n self.dbname = dbname if dbname else config['dbname']\n self.driver = 'org.postgresql.Driver'\n self.jdbc = 'jdbc:postgresql://'\n\nclass Database():\n def __init__(self, args, timeout=-1):\n self.args = args\n self.conn = self.resetConn(timeout)\n\n\n # self.schema = self.compute_table_schema()\n\n def resetConn(self, timeout=-1):\n conn = psycopg2.connect(database=self.args.dbname,\n user=self.args.user,\n password=self.args.password,\n host=self.args.host,\n port=self.args.port)\n return conn\n\n def execute_sqls(self,sql):\n self.conn =self.resetConn(timeout=-1)\n cur = self.conn.cursor()\n cur.execute(sql)\n self.conn.commit()\n cur.close()\n self.conn.close()\n\n def execute_sql_duration(self, duration, sql, max_id=0, commit_interval=500):\n self.conn = self.resetConn(timeout=-1)\n cursor = self.conn.cursor()\n start = time.time()\n cnt = 0\n if duration > 0:\n while (time.time() - start) < duration:\n if max_id > 0:\n id = 
random.randint(1, max_id - 1)\n cursor.execute(sql + str(id) + ';')\n else:\n cursor.execute(sql)\n cnt += 1\n if cnt % commit_interval == 0:\n self.conn.commit()\n else:\n print(\"error, the duration should be larger than 0\")\n self.conn.commit()\n cursor.close()\n self.conn.close()\n return cnt\n\n def concurrent_execute_sql(self, threads, duration, sql, max_id=0, commit_interval=500):\n pool = ThreadPool(threads)\n results = [pool.apply_async(self.execute_sql_duration, (duration, sql, max_id, commit_interval)) for _ in range(threads)]\n pool.close()\n pool.join()\n return results\n \ndef init():\n #add the config\n config_path = \"/root/DB-GPT/config/tool_config.yaml\"\n with open(config_path, 'r') as config_file:\n config = yaml.safe_load(config_file) \n db_args =DBArgs('pgsql', config)\n return db_args\n\n#create a table\ndef create_table(table_name,colsize, ncolumns):\n db=Database(init())\n column_definitions = ', '.join(f'name{i} varchar({colsize})' for i in range(ncolumns))\n creat_sql = f'CREATE TABLE {table_name} (id int, {column_definitions}, time timestamp);'\n db.execute_sqls(creat_sql)\n\n#delete the table\ndef delete_table(table_name):\n db=Database(init())\n delete_sql=f'DROP TABLE if exists {table_name}'\n db.execute_sqls(delete_sql)\n\n#print the current time\ndef print_time():\n current_time = datetime.datetime.now()\n formatted_time = current_time.strftime(\"%Y-%m-%d %H:%M:%S\")\n print(formatted_time)\n\n\ndef insert_large_data(threads,duration,ncolumns,nrows,colsize,table_name='table1'):\n\n print_time()\n #Delete undeleted tables\n delete_table(table_name)\n #create a new table\n create_table(table_name,colsize, ncolumns)\n db=Database(init())\n #insert the data\n #insert_definitions = ', '.join(f'repeat(round(random()*999)::text,{(colsize//3)})' for i in range(ncolumns))\n insert_definitions = ', '.join(f'(SELECT substr(md5(random()::text), 1, {colsize}))' for i in range(ncolumns))\n insert_data=f'insert into {table_name} select generate_series(1,{nrows}),{insert_definitions}, now();'\n db.concurrent_execute_sql(threads,duration,insert_data,commit_interval=1)\n\n #delete the table\n delete_table(table_name)\n \n #print the end time\n print_time()\nif __name__ == \"__main__\":\n # Number of threads to use for concurrent inserts\n num_threads = 107\n \n # Duration for which to run the inserts (in seconds)\n insert_duration = None\n \n # Number of columns in the table\n num_columns = 18\n \n # Number of rows to insert\n num_rows = 60\n \n # Size of each column (in characters)\n column_size = 33\n \n # Table name\n table_name = 'table1'\n \n # Call the insert_large_data function\n insert_large_data(num_threads, insert_duration, num_columns, num_rows, column_size, table_name)\n", + "description": "In an Internet of Things (IoT) application, there is a need to insert a large amount of data from 107 sensors into a database. Each row of data contains 18 columns, with each column having a size of 33 characters. There are a total of 60 rows to be inserted. This simulation aims to test the performance and resilience of the database system when faced with such a large-scale data insertion process.\n", + "desc": "In the IoT scenario, there is a database called 'SensorDataDB' used to collect and analyze data from various sensors. Within this database, there is a table named 'SensorReadings' that stores information from these sensors. 
This table contains 60 rows of data, each representing a sensor reading, and has 18 columns that store different attributes such as sensor ID, reading type, reading value, timestamp, sensor location, and status. When a large number of sensors start generating data simultaneously, the database may experience issues when it comes to handling these concurrent write requests. This could be due to a lack of effective data partitioning, insufficient buffering mechanisms, or improper indexing. These issues can lead to increased write latency, potential database locking, and ultimately result in anomalies within the database.\n" + }, + "345": { + "start_time": "1697347239", + "end_time": "1697347311", + "start_timestamp": "2023-10-15 13:20:39", + "end_timestamp": "2023-10-15 13:21:51", + "alerts": [], + "labels": [ + "highly concurrent commits or highly concurrent inserts" + ], + "command": "python anomaly_trigger/main.py --anomaly INSERT_LARGE_DATA", + "script": "import psycopg2\nimport sys\nsys.path.append('/root/DB-GPT/')\nimport time\nimport datetime\nimport random\nimport yaml\nfrom multiprocessing.pool import *\n\nclass DBArgs(object):\n\n def __init__(self, dbtype, config, dbname=None):\n self.dbtype = dbtype\n if self.dbtype == 'mysql':\n self.host = config['host']\n self.port = config['port']\n self.user = config['user']\n self.password = config['password']\n self.dbname = dbname if dbname else config['dbname']\n self.driver = 'com.mysql.jdbc.Driver'\n self.jdbc = 'jdbc:mysql://'\n else:\n self.host = config['host']\n self.port = config['port']\n self.user = config['user']\n self.password = config['password']\n self.dbname = dbname if dbname else config['dbname']\n self.driver = 'org.postgresql.Driver'\n self.jdbc = 'jdbc:postgresql://'\n\nclass Database():\n def __init__(self, args, timeout=-1):\n self.args = args\n self.conn = self.resetConn(timeout)\n\n\n # self.schema = self.compute_table_schema()\n\n def resetConn(self, timeout=-1):\n conn = psycopg2.connect(database=self.args.dbname,\n user=self.args.user,\n password=self.args.password,\n host=self.args.host,\n port=self.args.port)\n return conn\n\n def execute_sqls(self,sql):\n self.conn =self.resetConn(timeout=-1)\n cur = self.conn.cursor()\n cur.execute(sql)\n self.conn.commit()\n cur.close()\n self.conn.close()\n\n def execute_sql_duration(self, duration, sql, max_id=0, commit_interval=500):\n self.conn = self.resetConn(timeout=-1)\n cursor = self.conn.cursor()\n start = time.time()\n cnt = 0\n if duration > 0:\n while (time.time() - start) < duration:\n if max_id > 0:\n id = random.randint(1, max_id - 1)\n cursor.execute(sql + str(id) + ';')\n else:\n cursor.execute(sql)\n cnt += 1\n if cnt % commit_interval == 0:\n self.conn.commit()\n else:\n print(\"error, the duration should be larger than 0\")\n self.conn.commit()\n cursor.close()\n self.conn.close()\n return cnt\n\n def concurrent_execute_sql(self, threads, duration, sql, max_id=0, commit_interval=500):\n pool = ThreadPool(threads)\n results = [pool.apply_async(self.execute_sql_duration, (duration, sql, max_id, commit_interval)) for _ in range(threads)]\n pool.close()\n pool.join()\n return results\n \ndef init():\n #add the config\n config_path = \"/root/DB-GPT/config/tool_config.yaml\"\n with open(config_path, 'r') as config_file:\n config = yaml.safe_load(config_file) \n db_args =DBArgs('pgsql', config)\n return db_args\n\n#create a table\ndef create_table(table_name,colsize, ncolumns):\n db=Database(init())\n column_definitions = ', '.join(f'name{i} 
varchar({colsize})' for i in range(ncolumns))\n creat_sql = f'CREATE TABLE {table_name} (id int, {column_definitions}, time timestamp);'\n db.execute_sqls(creat_sql)\n\n#delete the table\ndef delete_table(table_name):\n db=Database(init())\n delete_sql=f'DROP TABLE if exists {table_name}'\n db.execute_sqls(delete_sql)\n\n#print the current time\ndef print_time():\n current_time = datetime.datetime.now()\n formatted_time = current_time.strftime(\"%Y-%m-%d %H:%M:%S\")\n print(formatted_time)\n\n\ndef insert_large_data(threads,duration,ncolumns,nrows,colsize,table_name='table1'):\n\n print_time()\n #Delete undeleted tables\n delete_table(table_name)\n #create a new table\n create_table(table_name,colsize, ncolumns)\n db=Database(init())\n #insert the data\n #insert_definitions = ', '.join(f'repeat(round(random()*999)::text,{(colsize//3)})' for i in range(ncolumns))\n insert_definitions = ', '.join(f'(SELECT substr(md5(random()::text), 1, {colsize}))' for i in range(ncolumns))\n insert_data=f'insert into {table_name} select generate_series(1,{nrows}),{insert_definitions}, now();'\n db.concurrent_execute_sql(threads,duration,insert_data,commit_interval=1)\n\n #delete the table\n delete_table(table_name)\n \n #print the end time\n print_time()\nif __name__ == \"__main__\":\n # Number of threads to use for concurrent inserts\n num_threads = 107\n \n # Duration for which to run the inserts (in seconds)\n insert_duration = None\n \n # Number of columns in the table\n num_columns = 39\n \n # Number of rows to insert\n num_rows = 69\n \n # Size of each column (in characters)\n column_size = 93\n \n # Table name\n table_name = 'table1'\n \n # Call the insert_large_data function\n insert_large_data(num_threads, insert_duration, num_columns, num_rows, column_size, table_name)\n", + "description": "In a smart home system, where various IoT devices are interconnected, there are 107 devices that generate a large amount of sensor data simultaneously. This data needs to be inserted into the database table, which contains 39 columns with a column size of 93 characters, and there are 69 rows in total. However, the database may experience exceptions due to the high volume of data being inserted at the same time.\n", + "desc": "In an Internet of Things (IoT) scenario, imagine a database named 'SensorDataDB' that is used to collect and analyze data from various sensors. This database plays a crucial role in monitoring and managing smart home devices. It contains a table called 'SensorReadings' which stores the data collected from 69 sensors. Each row in this table represents a sensor reading and includes information such as sensor ID, reading type (e.g., temperature, humidity, motion), reading value, timestamp, sensor location, and status. Occasionally, all 107 sensors start transmitting data simultaneously, generating a large volume of data that needs to be inserted into the 'SensorReadings' table. However, due to the high number of concurrent insertions and the lack of proper optimization, the performance of the database may be impacted. This can result in slower insertion speeds and increased latency in processing the sensor data. 
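The task description in the configs asks the expert agents to decide whether a metric such as io or cpu_usage is abnormal using a statistical method; the check below is only an illustration of that idea (a plain z-score over a short window), not the repository's implementation:

import numpy as np

def is_metric_abnormal(values, z_threshold=3.0):
    """Return True if the newest sample deviates strongly from the rest of the
    window. A deliberately simple z-score check, for illustration only."""
    values = np.asarray(values, dtype=float)
    baseline, latest = values[:-1], values[-1]
    std = baseline.std()
    if std == 0:
        return bool(latest != baseline.mean())
    return bool(abs(latest - baseline.mean()) / std > z_threshold)

# e.g. an io wait series that spikes during the concurrent-insert window
print(is_metric_abnormal([2.1, 2.3, 1.9, 2.2, 2.0, 9.7]))  # True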
Such performance issues can lead to anomalies in the database and affect the overall functionality of the smart home system.\n" + }, + "346": { + "start_time": "1697347371", + "end_time": "1697347431", + "start_timestamp": "2023-10-15 13:22:51", + "end_timestamp": "2023-10-15 13:23:51", + "alerts": [], + "labels": [ + "highly concurrent updates" + ], + "command": "python anomaly_trigger/main.py --anomaly LOCK_CONTENTION", + "script": "import psycopg2\nimport sys\nsys.path.append('/root/DB-GPT/')\nimport time\nimport datetime\nimport random\nimport yaml\nfrom multiprocessing.pool import *\n\n\nclass DBArgs(object):\n\n def __init__(self, dbtype, config, dbname=None):\n self.dbtype = dbtype\n if self.dbtype == 'mysql':\n self.host = config['host']\n self.port = config['port']\n self.user = config['user']\n self.password = config['password']\n self.dbname = dbname if dbname else config['dbname']\n self.driver = 'com.mysql.jdbc.Driver'\n self.jdbc = 'jdbc:mysql://'\n else:\n self.host = config['host']\n self.port = config['port']\n self.user = config['user']\n self.password = config['password']\n self.dbname = dbname if dbname else config['dbname']\n self.driver = 'org.postgresql.Driver'\n self.jdbc = 'jdbc:postgresql://'\n\nclass Database():\n def __init__(self, args, timeout=-1):\n self.args = args\n self.conn = self.resetConn(timeout)\n\n\n # self.schema = self.compute_table_schema()\n\n def resetConn(self, timeout=-1):\n conn = psycopg2.connect(database=self.args.dbname,\n user=self.args.user,\n password=self.args.password,\n host=self.args.host,\n port=self.args.port)\n return conn\n \n def execute_sqls(self,sql):\n self.conn =self.resetConn(timeout=-1)\n cur = self.conn.cursor()\n cur.execute(sql)\n self.conn.commit()\n cur.close()\n self.conn.close()\n\n def execute_sql_duration(self, duration, sql, max_id=0, commit_interval=500):\n self.conn = self.resetConn(timeout=-1)\n cursor = self.conn.cursor()\n start = time.time()\n cnt = 0\n if duration > 0:\n while (time.time() - start) < duration:\n if max_id > 0:\n id = random.randint(1, max_id - 1)\n cursor.execute(sql + str(id) + ';')\n else:\n cursor.execute(sql)\n cnt += 1\n if cnt % commit_interval == 0:\n self.conn.commit()\n else:\n print(\"error, the duration should be larger than 0\")\n self.conn.commit()\n cursor.close()\n self.conn.close()\n return cnt\n\n def concurrent_execute_sql(self, threads, duration, sql, max_id=0, commit_interval=500):\n pool = ThreadPool(threads)\n results = [pool.apply_async(self.execute_sql_duration, (duration, sql, max_id, commit_interval)) for _ in range(threads)]\n pool.close()\n pool.join()\n return results\n\ndef init():\n #add the config\n config_path = \"/root/DB-GPT/config/tool_config.yaml\"\n with open(config_path, 'r') as config_file:\n config = yaml.safe_load(config_file) \n db_args =DBArgs('pgsql', config)\n return db_args\n\n\n#create a table\ndef create_table(table_name,colsize, ncolumns):\n db=Database(init())\n column_definitions = ', '.join(f'name{i} varchar({colsize})' for i in range(ncolumns))\n creat_sql = f'CREATE TABLE {table_name} (id int, {column_definitions}, time timestamp);'\n db.execute_sqls(creat_sql)\n\n#delete the table\ndef delete_table(table_name):\n db=Database(init())\n delete_sql=f'DROP TABLE if exists {table_name}'\n db.execute_sqls(delete_sql)\n\n#print the current time\ndef print_time():\n current_time = datetime.datetime.now()\n formatted_time = current_time.strftime(\"%Y-%m-%d %H:%M:%S\")\n print(formatted_time)\n\n\ndef 
lock_contention(threads,duration,ncolumns,nrows,colsize,table_name='table1'):\n #create a new table\n print_time()\n delete_table(table_name)\n create_table(table_name,colsize, ncolumns)\n db=Database(init())\n # insert some data to be updated \n insert_definitions = ', '.join(f'(SELECT substr(md5(random()::text), 1, {colsize}))' for i in range(ncolumns))\n insert_data=f'insert into {table_name} select generate_series(1,{nrows}),{insert_definitions}, now();' \n db.execute_sqls(insert_data) \n pool = Pool(threads)\n for _ in range(threads):\n pool.apply_async(\n lock, (table_name, ncolumns, colsize, duration, nrows))\n pool.close()\n pool.join()\n #delete the table\n delete_table(table_name)\n print_time()\n\ndef lock(table_name, ncolumns, colsize, duration, nrows):\n args=init()\n start = time.time()\n #lock_contention\n while time.time()-start < duration:\n conn = psycopg2.connect(database=args.dbname, user=args.user, password=args.password,\n host=args.host, port=args.port)\n cur = conn.cursor()\n while time.time()-start < duration:\n col_name = random.randint(0, ncolumns-1)\n row_name = random.randint(1, nrows-1)\n lock_contention = f'update {table_name} set name{col_name}=(SELECT substr(md5(random()::text), 1, {colsize})) where id ={row_name}'\n #db.concurrent_execute_sql(threads,duration,lock_contention,nrows)\n cur.execute(lock_contention)\n conn.commit()\n conn.commit()\n conn.close()\n\nif __name__ == \"__main__\":\n # Number of threads to use for concurrent inserts\n num_threads = 195\n \n # Duration for which to run the inserts (in seconds)\n insert_duration = None\n \n # Number of columns in the table\n num_columns = 62\n \n # Number of rows to insert\n num_rows = 300\n \n # Size of each column (in characters)\n column_size = 52\n \n # Table name\n table_name = 'table1'\n \n # Call the insert_large_data function\n lock_contention(num_threads, insert_duration, num_columns, num_rows, column_size, table_name)\n", + "description": "In a database system for an online platform, there is contention for locking the database table when 195 users simultaneously try to perform update operations on a table containing 62 columns and 300 rows of records with a column size of 52 characters. This causes a database exception due to the competition for lock acquisition.\n", + "desc": "In an Internet of Things (IoT) scenario, a database called 'IoTDataDB' is used for collecting and analyzing data from various IoT devices. This database stores data from 300 devices, each device producing a large amount of data. The main table in the database is called 'DeviceData', which contains multiple fields to store the data from these devices, such as device ID, sensor type, sensor value, timestamp, location, and status information. In this scenario, due to the large number of devices and the high frequency of data transmission, the database might encounter performance issues related to locking. When 195 devices simultaneously attempt to transmit data and update the 'DeviceData' table, the database might experience contention for locking resources. This contention occurs because multiple devices are trying to update the same or adjacent rows in the table at the same time. This can lead to delays in data processing and potentially cause the database server to become overloaded. During high traffic periods, such locking contention can result in slower data processing and potential data loss. 
It can also affect the overall functionality and reliability of the IoT system, as it may impact the real-time analysis and decision-making based on the collected IoT data.\n" + }, + "347": { + "start_time": "1697347491", + "end_time": "1697347542", + "start_timestamp": "2023-10-15 13:24:51", + "end_timestamp": "2023-10-15 13:25:42", + "alerts": [], + "labels": [ + "highly deletes" + ], + "command": "python anomaly_trigger/main.py --anomaly VACUUM", + "script": "import psycopg2\nimport sys\nsys.path.append('/root/DB-GPT/')\nimport time\nimport datetime\nimport random\nimport yaml\nfrom multiprocessing.pool import *\n\n\nclass DBArgs(object):\n\n def __init__(self, dbtype, config, dbname=None):\n self.dbtype = dbtype\n if self.dbtype == 'mysql':\n self.host = config['host']\n self.port = config['port']\n self.user = config['user']\n self.password = config['password']\n self.dbname = dbname if dbname else config['dbname']\n self.driver = 'com.mysql.jdbc.Driver'\n self.jdbc = 'jdbc:mysql://'\n else:\n self.host = config['host']\n self.port = config['port']\n self.user = config['user']\n self.password = config['password']\n self.dbname = dbname if dbname else config['dbname']\n self.driver = 'org.postgresql.Driver'\n self.jdbc = 'jdbc:postgresql://'\n\nclass Database():\n def __init__(self, args, timeout=-1):\n self.args = args\n self.conn = self.resetConn(timeout)\n\n\n # self.schema = self.compute_table_schema()\n\n def resetConn(self, timeout=-1):\n conn = psycopg2.connect(database=self.args.dbname,\n user=self.args.user,\n password=self.args.password,\n host=self.args.host,\n port=self.args.port)\n return conn\n \n def execute_sqls(self,sql):\n self.conn =self.resetConn(timeout=-1)\n cur = self.conn.cursor()\n cur.execute(sql)\n self.conn.commit()\n cur.close()\n self.conn.close()\n\n def execute_sql_duration(self, duration, sql, max_id=0, commit_interval=500):\n self.conn = self.resetConn(timeout=-1)\n cursor = self.conn.cursor()\n start = time.time()\n cnt = 0\n if duration > 0:\n while (time.time() - start) < duration:\n if max_id > 0:\n id = random.randint(1, max_id - 1)\n cursor.execute(sql + str(id) + ';')\n else:\n cursor.execute(sql)\n cnt += 1\n if cnt % commit_interval == 0:\n self.conn.commit()\n else:\n print(\"error, the duration should be larger than 0\")\n self.conn.commit()\n cursor.close()\n self.conn.close()\n return cnt\n\n def concurrent_execute_sql(self, threads, duration, sql, max_id=0, commit_interval=500):\n pool = ThreadPool(threads)\n results = [pool.apply_async(self.execute_sql_duration, (duration, sql, max_id, commit_interval)) for _ in range(threads)]\n pool.close()\n pool.join()\n return results\n \ndef init():\n #add the config\n config_path = \"/root/DB-GPT/config/tool_config.yaml\"\n with open(config_path, 'r') as config_file:\n config = yaml.safe_load(config_file) \n db_args =DBArgs('pgsql', config)\n return db_args\n\n\n#create a table\ndef create_table(table_name,colsize, ncolumns):\n db=Database(init())\n column_definitions = ', '.join(f'name{i} varchar({colsize})' for i in range(ncolumns))\n creat_sql = f'CREATE TABLE {table_name} (id int, {column_definitions}, time timestamp);'\n db.execute_sqls(creat_sql)\n\n#delete the table\ndef delete_table(table_name):\n db=Database(init())\n delete_sql=f'DROP TABLE if exists {table_name}'\n db.execute_sqls(delete_sql)\n\n#print the current time\ndef print_time():\n current_time = datetime.datetime.now()\n formatted_time = current_time.strftime(\"%Y-%m-%d %H:%M:%S\")\n print(formatted_time)\n\n\ndef 
vacuum(threads,duration,ncolumns,nrows,colsize,table_name='table1'):\n db=Database(init())\n #create a new table\n print_time()\n delete_table(table_name)\n create_table(table_name,colsize, ncolumns)\n\n # insert some data to be deleted\n insert_definitions = ', '.join(f'(SELECT substr(md5(random()::text), 1, {colsize}))' for i in range(ncolumns))\n insert_data=f'insert into {table_name} select generate_series(1,{nrows}),{insert_definitions}, now();' \n db.execute_sqls(insert_data) \n\n # delete 80% of the rows\n delete_nrows=int(nrows*0.8)\n vacuum=f'delete from {table_name} where id < {delete_nrows};'\n db.execute_sqls(vacuum)\n\n # do the select , then the vacuum occurs\n select='select * from '+table_name+' where id='\n db.concurrent_execute_sql(threads,duration,select,nrows)\n\n #delete the table\n delete_table(table_name)\n print_time()\n\n\nif __name__ == \"__main__\":\n # Number of threads to use for concurrent inserts\n num_threads = 138\n \n # Duration for which to run the inserts (in seconds)\n insert_duration = None\n \n # Number of columns in the table\n num_columns = 18\n \n # Number of rows to insert\n num_rows = 3056064\n \n # Size of each column (in characters)\n column_size = 93\n \n # Table name\n table_name = 'table1'\n \n # Call the insert_large_data function\n vacuum(num_threads, insert_duration, num_columns, num_rows, column_size, table_name)\n", + "description": "In the database of an online store, if there are 138 users searching in the database table containing 18 columns, 3,056,064 rows, each column size of 93 characters, and a vacuum operation is performed on the database table, resulting in an exception due to the search queries being executed while the vacuum operation is in progress.\n", + "desc": "In an e-commerce platform, there is a database called 'ECommerceDB' that stores information about various products. Within the database, there is a table named 'ProductDetails' that contains a large amount of data, specifically 3,056,064 rows. Each row represents a product and includes 18 columns, such as product ID, name, price, stock quantity, description, brand, category, size, color, weight, production date, expiration date, supplier information, country, user rating, number of reviews, sales status, and image link. Due to business needs, such as product updates or clearing out-of-date products, a vacuum operation needs to be performed. The vacuum operation involves reclaiming disk space by physically rearranging the data and indexes in the table, essentially defragmenting the table. If this vacuum operation is not properly optimized or scheduled during low traffic periods, it could cause anomalies in the database. These anomalies could lead to performance issues, such as increased query response time or reduced system throughput. 
Therefore, it is important to implement appropriate strategies for vacuuming the database table to avoid such anomalies and maintain optimal database performance.\n" + }, + "348": { + "start_time": "1697347602", + "end_time": "1697347717", + "start_timestamp": "2023-10-15 13:26:42", + "end_timestamp": "2023-10-15 13:28:37", + "alerts": [], + "labels": [ + "too many indexes" + ], + "command": "python anomaly_trigger/main.py --anomaly REDUNDANT_INDEX", + "script": "import psycopg2\nimport sys\nsys.path.append('/root/DB-GPT/')\nimport time\nimport datetime\nimport random\nimport yaml\nfrom multiprocessing.pool import *\n\n\nclass DBArgs(object):\n\n def __init__(self, dbtype, config, dbname=None):\n self.dbtype = dbtype\n if self.dbtype == 'mysql':\n self.host = config['host']\n self.port = config['port']\n self.user = config['user']\n self.password = config['password']\n self.dbname = dbname if dbname else config['dbname']\n self.driver = 'com.mysql.jdbc.Driver'\n self.jdbc = 'jdbc:mysql://'\n else:\n self.host = config['host']\n self.port = config['port']\n self.user = config['user']\n self.password = config['password']\n self.dbname = dbname if dbname else config['dbname']\n self.driver = 'org.postgresql.Driver'\n self.jdbc = 'jdbc:postgresql://'\n\nclass Database():\n def __init__(self, args, timeout=-1):\n self.args = args\n self.conn = self.resetConn(timeout)\n\n\n # self.schema = self.compute_table_schema()\n\n def resetConn(self, timeout=-1):\n conn = psycopg2.connect(database=self.args.dbname,\n user=self.args.user,\n password=self.args.password,\n host=self.args.host,\n port=self.args.port)\n return conn\n \n def execute_sqls(self,sql):\n self.conn =self.resetConn(timeout=-1)\n cur = self.conn.cursor()\n cur.execute(sql)\n self.conn.commit()\n cur.close()\n self.conn.close()\n\n def execute_sql_duration(self, duration, sql, max_id=0, commit_interval=500):\n self.conn = self.resetConn(timeout=-1)\n cursor = self.conn.cursor()\n start = time.time()\n cnt = 0\n if duration > 0:\n while (time.time() - start) < duration:\n if max_id > 0:\n id = random.randint(1, max_id - 1)\n cursor.execute(sql + str(id) + ';')\n else:\n cursor.execute(sql)\n cnt += 1\n if cnt % commit_interval == 0:\n self.conn.commit()\n else:\n print(\"error, the duration should be larger than 0\")\n self.conn.commit()\n cursor.close()\n self.conn.close()\n return cnt\n\n def concurrent_execute_sql(self, threads, duration, sql, max_id=0, commit_interval=500):\n pool = ThreadPool(threads)\n results = [pool.apply_async(self.execute_sql_duration, (duration, sql, max_id, commit_interval)) for _ in range(threads)]\n pool.close()\n pool.join()\n return results\n \n def build_index(self, table_name, idx_num):\n self.conn = self.resetConn(timeout=-1)\n cursor = self.conn.cursor()\n \n for i in range(0, idx_num):\n the_sql = 'CREATE INDEX index_' + table_name + '_' + str(i) + ' ON ' + table_name + '(name' + str(i) + ');'\n print(the_sql)\n cursor.execute(the_sql)\n\n \n self.conn.commit()\n self.conn.close()\n return\n\n\n \n def drop_index(self,table_name):\n self.conn = self.resetConn(timeout=-1)\n cursor = self.conn.cursor()\n cursor.execute(\"select indexname from pg_indexes where tablename='\"+table_name+\"';\")\n idxs = cursor.fetchall()\n for idx in idxs:\n the_sql = 'DROP INDEX ' + idx[0] + ';'\n cursor.execute(the_sql)\n print(the_sql)\n self.conn.commit()\n self.conn.close()\n return\n\n\ndef init():\n #add the config\n config_path = \"/root/DB-GPT/config/tool_config.yaml\"\n with open(config_path, 'r') as 
config_file:\n config = yaml.safe_load(config_file) \n db_args =DBArgs('pgsql', config)\n return db_args\n\n\n#create a table\ndef create_table(table_name,colsize, ncolumns):\n db=Database(init())\n column_definitions = ', '.join(f'name{i} varchar({colsize})' for i in range(ncolumns))\n creat_sql = f'CREATE TABLE {table_name} (id int, {column_definitions}, time timestamp);'\n db.execute_sqls(creat_sql)\n\n#delete the table\ndef delete_table(table_name):\n db=Database(init())\n delete_sql=f'DROP TABLE if exists {table_name}'\n db.execute_sqls(delete_sql)\n\n#print the current time\ndef print_time():\n current_time = datetime.datetime.now()\n formatted_time = current_time.strftime(\"%Y-%m-%d %H:%M:%S\")\n print(formatted_time)\n\ndef redundent_index(threads,duration,ncolumns,nrows,colsize,nindex,table_name='table1'):\n #create a new table\n print_time()\n delete_table(table_name)\n create_table(table_name,colsize, ncolumns)\n db=Database(init())\n # insert some data to be updated \n insert_definitions = ', '.join(f'(SELECT substr(md5(random()::text), 1, {colsize}))' for i in range(ncolumns))\n insert_data=f'insert into {table_name} select generate_series(1,{nrows}),{insert_definitions}, now();' \n db.execute_sqls(insert_data) \n\n #initialization of the indexes\n nindex=int((nindex*ncolumns)/10)\n db.build_index(table_name,nindex)\n id_index='CREATE INDEX index_'+table_name+'_id ON '+table_name+'(id);'\n db.execute_sqls(id_index)\n\n #lock_contention\n pool = Pool(threads)\n for _ in range(threads):\n pool.apply_async(\n lock, (table_name, ncolumns, colsize, duration, nrows))\n pool.close()\n pool.join()\n\n #drop the index\n db.drop_index(table_name)\n\n #delete the table\n delete_table(table_name)\n print_time()\n\n\ndef lock(table_name, ncolumns, colsize, duration, nrows):\n args=init()\n start = time.time()\n #lock_contention\n while time.time()-start < duration:\n conn = psycopg2.connect(database=args.dbname, user=args.user, password=args.password,\n host=args.host, port=args.port)\n cur = conn.cursor()\n while time.time()-start < duration:\n col_name = random.randint(0, ncolumns-1)\n row_name = random.randint(1, nrows-1)\n lock_contention = f'update {table_name} set name{col_name}=(SELECT substr(md5(random()::text), 1, {colsize})) where id ={row_name}'\n #db.concurrent_execute_sql(threads,duration,lock_contention,nrows)\n cur.execute(lock_contention)\n conn.commit()\n conn.commit()\n conn.close()\n\nif __name__ == \"__main__\":\n # Number of threads to use for concurrent inserts\n num_threads = 6\n \n # Duration for which to run the inserts (in seconds)\n insert_duration = None\n \n # Number of columns in the table\n num_columns = 77\n \n # Number of rows to insert\n num_rows = 660192\n \n # Size of each column (in characters)\n column_size = 62\n \n # Table name\n table_name = 'table1'\n \n nindex=6\n \n # Call the insert_large_data function\n redundent_index(num_threads, insert_duration, num_columns, num_rows, column_size, nindex,table_name)\n", + "description": "In a database with 77 columns and 660,192 rows, each with a column size of 62 characters, unnecessary indexes are created and then deleted after 6 users perform queries. This process simulates the additional storage and performance overhead caused by the creation and deletion of these indexes.\n", + "desc": "In an e-commerce platform scenario, there is a database called 'OnlineStoreDB' that is used for storing and managing product information. 
This database contains multiple tables, one of which is a key table named 'ProductDetails'. This table contains a large amount of data, with 660,192 rows representing individual product entries. Each row has a total of 77 columns, with each column capable of storing up to 62 characters. These columns may include information such as product ID, name, price, stock quantity, description, brand, category, size, color, weight, production date, expiration date, supplier information, country, rating, number of reviews, sales status, promotional information, image link, and more. In this specific scenario, the database administrator notices that there are redundant indexes in the 'ProductDetails' table. These redundant indexes were created to optimize query performance for various product-related queries, such as searching by brand or category. However, the redundant indexes are not necessary and can lead to additional storage usage and performance overhead. When multiple users simultaneously perform queries on the 'ProductDetails' table, the database might struggle to handle the workload efficiently due to the presence of redundant indexes. This could result in slower query response times, increased storage consumption, and potential impact on the overall performance of the e-commerce platform.\n" + }, + "349": { + "start_time": "1697347777", + "end_time": "1697347868", + "start_timestamp": "2023-10-15 13:29:37", + "end_timestamp": "2023-10-15 13:31:08", + "alerts": [], + "labels": [ + "INSERT_LARGE_DATA", + "IO_CONTENTION" + ], + "command": "python anomaly_trigger/main.py --anomaly INSERT_LARGE_DATA,IO_CONTENTION", + "script": "import os\nimport datetime\n\n#print the current time\ndef print_time():\n current_time = datetime.datetime.now()\n formatted_time = current_time.strftime(\"%Y-%m-%d %H:%M:%S\")\n print(formatted_time)\n\nif __name__ == \"__main__\":\n print_time()\n command = (\n \"su - root -c 'cd /sysbench-tpcc-master; \"\n \"./tpcc.lua --db-driver=pgsql --tables=2 --scale=3 --threads=50 --events=0 \"\n \"--pgsql-host=xxxx --pgsql-user=xxxx --pgsql-password=xxxx \"\n \"--pgsql-port=5432 --pgsql-db=tpcc --time=90 --rand-type=uniform --report-interval=10 run'\"\n )\n\n os.system(command)\n print_time()\n", + "description": "In a busy file sharing system, multiple users are uploading, downloading, or editing files concurrently. The system is experiencing contention issues with the input/output (I/O) operations, resulting in slower file transfers.\n", + "desc": "In a file sharing system scenario, imagine a database named 'FileShareDB' that is used for sharing files among teams or organizations. This database stores both the files themselves and metadata related to the files, such as uploader information, file size, creation and modification dates, version history, access permissions, and download counts. In this scenario, multiple users could be uploading, downloading, or editing files simultaneously. For example, a project team collaborates to complete an important report, with team members frequently uploading the latest versions of files while others download them for viewing or editing. Additionally, the system might be used for storing and sharing large files like presentations, video conference recordings, or design drawings. Due to the high concurrency in file operations, the 'FileShareDB' database faces challenges related to input/output (I/O) contention. 
When multiple users upload or download large files at the same time, the system's storage and network bandwidth might be significantly strained. This I/O contention can lead to slower file transfer speeds, especially when there is limited bandwidth or insufficient server processing capabilities. Furthermore, frequent write operations in the database, such as file uploads and metadata updates, can impact the performance of the database. During peak periods, the database might encounter locking and transaction management issues, further slowing down file processing and metadata recording.\n" + }, + "350": { + "start_time": "1697347928", + "end_time": "1697347988", + "start_timestamp": "2023-10-15 13:32:08", + "end_timestamp": "2023-10-15 13:33:08", + "alerts": [], + "labels": [ + "POOR JOIN PERFORMANCE", + "CPU CONTENTION" + ], + "command": "python anomaly_trigger/main.py --anomaly POOR_JOIN_PERFORMANCE,CPU_CONTENTION", + "script": "import os\nimport re\nimport time\n\nimport psycopg2\n\n\nREPEATCOUNT = 1\nTIMELOGPATH = str(int(time.time())) + \"_job_trigger_time_log.txt\"\nTIMELOG = open(TIMELOGPATH, 'w+')\n\n\nclass Database():\n\n def __init__(self):\n self.conn = None\n self.conn = psycopg2.connect(database='imdbload',\n user='xxxx',\n password='xxxx',\n host='xxxx',\n port=5432)\n\n def execute_sql(self, sql):\n fail = 1\n cur = self.conn.cursor()\n i = 0\n cnt = 3\n while fail == 1 and i < cnt:\n try:\n fail = 0\n cur.execute(sql)\n except BaseException as error:\n fail = 1\n print(error)\n res = []\n if fail == 0:\n res = cur.fetchall()\n i = i + 1\n if fail == 1:\n # print(\"SQL Execution Fatal!!\", sql)\n return 0, ''\n elif fail == 0:\n return 1, res\n\n\ndef all_sql_files():\n res_path = \"{}/join-order-benchmark-master/\".format(\n os.path.dirname(os.path.abspath(__file__)))\n # all_file_list = list(filter(file_filter, os.listdir(res_path)))\n # all_file_list = sorted(all_file_list, key=custom_sort)\n all_file_list = [\n '1a.sql', '1b.sql', '1c.sql', '1d.sql',\n '2a.sql', '2b.sql', '2c.sql', '2d.sql',\n '3a.sql', '3b.sql', '3c.sql',\n '4a.sql', '4b.sql', '4c.sql',\n '5a.sql', '5b.sql', '5c.sql',\n '6a.sql', '6b.sql', '6c.sql', '6d.sql', '6e.sql', '6f.sql',\n '7a.sql', '7b.sql', '7c.sql',\n '8a.sql', '8b.sql', '8c.sql', '8d.sql',\n '9a.sql', '9b.sql', '9c.sql', '9d.sql',\n '10a.sql', '10b.sql', '10c.sql',\n '11a.sql', '11b.sql', '11c.sql', '11d.sql',\n '12a.sql', '12b.sql', '12c.sql',\n '13a.sql', '13b.sql', '13c.sql', '13d.sql',\n '14a.sql', '14b.sql', '14c.sql',\n '15a.sql', '15b.sql', '15c.sql', '15d.sql',\n '16a.sql', '16b.sql', '16c.sql', '16d.sql',\n '17a.sql', '17b.sql', '17c.sql', '17d.sql', '17e.sql', '17f.sql',\n '18a.sql', '18b.sql', '18c.sql',\n '19a.sql', '19b.sql', '19c.sql', '19d.sql',\n '20a.sql', '20b.sql', '20c.sql',\n '21a.sql', '21b.sql', '21c.sql',\n '22a.sql', '22b.sql', '22c.sql', '22d.sql',\n '23a.sql', '23b.sql', '23c.sql',\n '24a.sql', '24b.sql',\n '25a.sql', '25b.sql', '25c.sql',\n '26a.sql', '26b.sql', '26c.sql',\n '27a.sql', '27b.sql', '27c.sql',\n '28a.sql', '28b.sql', '28c.sql',\n '29a.sql', '29b.sql', '29c.sql',\n '30a.sql', '30b.sql', '30c.sql',\n '31a.sql', '31b.sql', '31c.sql',\n '32a.sql', '32b.sql',\n '33a.sql', '33b.sql', '33c.sql']\n\n print(all_file_list)\n files_list = []\n for file in all_file_list:\n files_list.append(res_path + file)\n return files_list\n\n\ndef custom_sort(item):\n # \u63d0\u53d6\u6570\u5b57\u548c\u5b57\u6bcd\u90e8\u5206\n match = re.match(r'(\\d+)(\\D+)', item)\n # 
\u5c06\u6570\u5b57\u90e8\u5206\u8f6c\u6362\u4e3a\u6574\u6570\u4ee5\u8fdb\u884c\u6bd4\u8f83\n num_part = int(match.group(1))\n # \u8fd4\u56de\u5143\u7ec4\u4ee5\u6309\u6570\u5b57\u548c\u5b57\u6bcd\u6392\u5e8f\n return (num_part, match.group(2))\n\n\ndef file_filter(f):\n if f[-4:] == '.sql' and 'schema' not in f and 'fkindexes' not in f:\n return True\n else:\n return False\n\n\ndef get_sql_from_file(file_name):\n file = open(file_name)\n lines = file.readlines().copy()\n sql = ''\n for line in lines:\n sql += line\n sql = sql.replace('\n', ' ').replace(' ', ' ').replace(' ', ' ')\n file.close()\n return sql\n\n\ndef test_hint_from_file(sql_file):\n db = Database()\n sql = get_sql_from_file(sql_file)\n success, result_cont = db.execute_sql(sql)\n print(success, result_cont)\n\n\ndef test_all():\n sql_files = all_sql_files()\n\n for sql_file in list(sql_files)[:-10]:\n if sql_file:\n test_hint_from_file(sql_file)\n\n\ndef test_one():\n res_path = \"{}/join-order-benchmark-master/\".format(\n os.path.dirname(os.path.abspath(__file__)))\n test_hint_from_file(res_path + '1a.sql')\n\n\nif __name__ == '__main__':\n for i in range(0, REPEATCOUNT):\n TIMELOG.write(str(int(time.time()))+\";\")\n test_all()\n TIMELOG.write(str(int(time.time()))+\"\n\")\n TIMELOG.flush()\n\n TIMELOG.close()\n\n)\n", + "description": "In a data analysis platform, multiple users are performing join operations on a large dataset using 100% of the available CPU resources. This intense CPU competition is causing poor performance in the join operations, resulting in slow data processing.\n", + "desc": "In a business intelligence scenario, there is a database named 'CorporateAnalyticsDB', which stores and analyzes various business data for a large corporation. This database contains multiple complex tables such as 'SalesData', 'CustomerProfiles', 'ProductCatalog', and 'MarketTrends' that hold detailed business information. Analysts in this scenario frequently need to perform complex join queries across these tables to generate comprehensive business reports. For example, they might need to join the 'SalesData' table (containing sales records) with the 'CustomerProfiles' table (containing customer information) to analyze purchasing behaviors. They might also need to link this data with the 'ProductCatalog' table and 'MarketTrends' table to gain market insights. However, due to the large size of these tables and the complexity of the join operations, these queries can be very slow. If the join keys are not effectively indexed or the queries are not optimized, executing the joins can consume a significant amount of time and resources. Additionally, during peak periods when multiple complex join queries are executed simultaneously, there might be competition for CPU resources, further reducing query efficiency. 
This CPU contention could occur due to compute-intensive queries running on the database server or insufficient CPU resources to handle the workload.\n" + }, + "351": { + "start_time": "1697348048", + "end_time": "1697348198", + "start_timestamp": "2023-10-15 13:34:08", + "end_timestamp": "2023-10-15 13:36:38", + "alerts": [], + "labels": [ + "FETCH_LARGE_DATA", + "CORRELATED SUBQUERY" + ], + "command": "python anomaly_trigger/main.py --anomaly FETCH_LARGE_DATA,CORRELATED_SUBQUERY", + "script": "import os\nimport re\nimport time\n\nimport psycopg2\n\n\nREPEATCOUNT = 1\nTIMELOGPATH = str(int(time.time())) + \"_tpch_trigger_time_log.txt\"\nTIMELOG = open(TIMELOGPATH, 'w+')\n\n\nclass Database():\n\n def __init__(self):\n self.conn = None\n self.conn = psycopg2.connect(database='tpch',\n user='xxxx',\n password='xxxx',\n host='xxxx',\n port=5432)\n\n def execute_sql(self, sql):\n fail = 1\n cur = self.conn.cursor()\n i = 0\n cnt = 3\n while fail == 1 and i < cnt:\n try:\n fail = 0\n cur.execute(sql)\n except BaseException as error:\n fail = 1\n print(error)\n res = []\n if fail == 0:\n res = cur.fetchall()\n i = i + 1\n if fail == 1:\n # print(\"SQL Execution Fatal!!\", sql)\n return 0, ''\n elif fail == 0:\n return 1, res\n\n\ndef all_sql_files():\n res_path = \"{}/tpch-queries/\".format(\n os.path.dirname(os.path.abspath(__file__)))\n # all_file_list = list(filter(file_filter, os.listdir(res_path)))\n # all_file_list = sorted(all_file_list, key=custom_sort)\n all_file_list = [\n '4.explain.sql']\n\n print(all_file_list)\n files_list = []\n for file in all_file_list:\n files_list.append(res_path + file)\n return files_list\n\n\ndef custom_sort(item):\n # \u63d0\u53d6\u6570\u5b57\u548c\u5b57\u6bcd\u90e8\u5206\n match = re.match(r'(\\d+)(\\D+)', item)\n # \u5c06\u6570\u5b57\u90e8\u5206\u8f6c\u6362\u4e3a\u6574\u6570\u4ee5\u8fdb\u884c\u6bd4\u8f83\n num_part = int(match.group(1))\n # \u8fd4\u56de\u5143\u7ec4\u4ee5\u6309\u6570\u5b57\u548c\u5b57\u6bcd\u6392\u5e8f\n return (num_part, match.group(2))\n\n\ndef file_filter(f):\n if f[-4:] == '.sql' and 'schema' not in f and 'fkindexes' not in f:\n return True\n else:\n return False\n\n\ndef get_sql_from_file(file_name):\n file = open(file_name)\n lines = file.readlines().copy()\n sql = ''\n for line in lines:\n sql += line\n sql = sql.replace('\n', ' ').replace(' ', ' ').replace(' ', ' ')\n file.close()\n return sql\n\n\ndef test_hint_from_file(sql_file):\n db = Database()\n sql = get_sql_from_file(sql_file)\n success, result_cont = db.execute_sql(sql)\n print(success, result_cont)\n\n\ndef test_all():\n sql_files = all_sql_files()\n\n for sql_file in list(sql_files):\n if sql_file:\n test_hint_from_file(sql_file)\n\n\ndef test_one():\n res_path = \"{}/tpch-queries/\".format(\n os.path.dirname(os.path.abspath(__file__)))\n test_hint_from_file(res_path + '1.explain.sql')\n\n\nif __name__ == '__main__':\n for i in range(0, REPEATCOUNT):\n TIMELOG.write(str(int(time.time()))+\";\")\n test_all()\n TIMELOG.write(str(int(time.time()))+\"\n\")\n TIMELOG.flush()\n\n TIMELOG.close()\n", + "description": "In an e-commerce website, when querying the inventory for each product, the script will simulate a scenario where a large amount of data needs to be fetched and related subqueries are involved. If the subqueries are not optimized, the performance of the inventory query may be negatively impacted.\n", + "desc": "In an e-commerce environment that requires tracking and managing inventory information, there is a database named 'InventoryDB'. 
This database contains a table called 'ProductInventory' that records the inventory details for various products. Each row in this table represents a specific product and includes information such as the product ID, current stock level, last update time, supplier ID, and warehouse location. To fulfill various business requirements, such as determining the total inventory of products in a specific category, it is necessary to perform queries that involve correlated subqueries. For example, one might need to retrieve all products within a certain category from the 'ProductDetails' table and then use subqueries on the 'ProductInventory' table to obtain the inventory information for those products. However, when the number of products in a category is large, executing individual subqueries for each product can lead to inefficient performance. This is because retrieving inventory information for a large number of products requires reading a significant amount of data from the disk, which can result in I/O bottlenecks.\n" + }, + "352": { + "start_time": "1697348259", + "end_time": "1697348330", + "start_timestamp": "2023-10-15 13:37:39", + "end_timestamp": "2023-10-15 13:38:50", + "alerts": [], + "labels": [ + "highly concurrent commits or highly concurrent inserts" + ], + "command": "python anomaly_trigger/main.py --anomaly INSERT_LARGE_DATA", + "script": "import psycopg2\nimport sys\nsys.path.append('/root/DB-GPT/')\nimport time\nimport datetime\nimport random\nimport yaml\nfrom multiprocessing.pool import *\n\nclass DBArgs(object):\n\n def __init__(self, dbtype, config, dbname=None):\n self.dbtype = dbtype\n if self.dbtype == 'mysql':\n self.host = config['host']\n self.port = config['port']\n self.user = config['user']\n self.password = config['password']\n self.dbname = dbname if dbname else config['dbname']\n self.driver = 'com.mysql.jdbc.Driver'\n self.jdbc = 'jdbc:mysql://'\n else:\n self.host = config['host']\n self.port = config['port']\n self.user = config['user']\n self.password = config['password']\n self.dbname = dbname if dbname else config['dbname']\n self.driver = 'org.postgresql.Driver'\n self.jdbc = 'jdbc:postgresql://'\n\nclass Database():\n def __init__(self, args, timeout=-1):\n self.args = args\n self.conn = self.resetConn(timeout)\n\n\n # self.schema = self.compute_table_schema()\n\n def resetConn(self, timeout=-1):\n conn = psycopg2.connect(database=self.args.dbname,\n user=self.args.user,\n password=self.args.password,\n host=self.args.host,\n port=self.args.port)\n return conn\n\n def execute_sqls(self,sql):\n self.conn =self.resetConn(timeout=-1)\n cur = self.conn.cursor()\n cur.execute(sql)\n self.conn.commit()\n cur.close()\n self.conn.close()\n\n def execute_sql_duration(self, duration, sql, max_id=0, commit_interval=500):\n self.conn = self.resetConn(timeout=-1)\n cursor = self.conn.cursor()\n start = time.time()\n cnt = 0\n if duration > 0:\n while (time.time() - start) < duration:\n if max_id > 0:\n id = random.randint(1, max_id - 1)\n cursor.execute(sql + str(id) + ';')\n else:\n cursor.execute(sql)\n cnt += 1\n if cnt % commit_interval == 0:\n self.conn.commit()\n else:\n print(\"error, the duration should be larger than 0\")\n self.conn.commit()\n cursor.close()\n self.conn.close()\n return cnt\n\n def concurrent_execute_sql(self, threads, duration, sql, max_id=0, commit_interval=500):\n pool = ThreadPool(threads)\n results = [pool.apply_async(self.execute_sql_duration, (duration, sql, max_id, commit_interval)) for _ in range(threads)]\n pool.close()\n 
pool.join()\n return results\n \ndef init():\n #add the config\n config_path = \"/root/DB-GPT/config/tool_config.yaml\"\n with open(config_path, 'r') as config_file:\n config = yaml.safe_load(config_file) \n db_args =DBArgs('pgsql', config)\n return db_args\n\n#create a table\ndef create_table(table_name,colsize, ncolumns):\n db=Database(init())\n column_definitions = ', '.join(f'name{i} varchar({colsize})' for i in range(ncolumns))\n creat_sql = f'CREATE TABLE {table_name} (id int, {column_definitions}, time timestamp);'\n db.execute_sqls(creat_sql)\n\n#delete the table\ndef delete_table(table_name):\n db=Database(init())\n delete_sql=f'DROP TABLE if exists {table_name}'\n db.execute_sqls(delete_sql)\n\n#print the current time\ndef print_time():\n current_time = datetime.datetime.now()\n formatted_time = current_time.strftime(\"%Y-%m-%d %H:%M:%S\")\n print(formatted_time)\n\n\ndef insert_large_data(threads,duration,ncolumns,nrows,colsize,table_name='table1'):\n\n print_time()\n #Delete undeleted tables\n delete_table(table_name)\n #create a new table\n create_table(table_name,colsize, ncolumns)\n db=Database(init())\n #insert the data\n #insert_definitions = ', '.join(f'repeat(round(random()*999)::text,{(colsize//3)})' for i in range(ncolumns))\n insert_definitions = ', '.join(f'(SELECT substr(md5(random()::text), 1, {colsize}))' for i in range(ncolumns))\n insert_data=f'insert into {table_name} select generate_series(1,{nrows}),{insert_definitions}, now();'\n db.concurrent_execute_sql(threads,duration,insert_data,commit_interval=1)\n\n #delete the table\n delete_table(table_name)\n \n #print the end time\n print_time()\nif __name__ == \"__main__\":\n # Number of threads to use for concurrent inserts\n num_threads = 137\n \n # Duration for which to run the inserts (in seconds)\n insert_duration = None\n \n # Number of columns in the table\n num_columns = 5\n \n # Number of rows to insert\n num_rows = 80\n \n # Size of each column (in characters)\n column_size = 67\n \n # Table name\n table_name = 'table1'\n \n # Call the insert_large_data function\n insert_large_data(num_threads, insert_duration, num_columns, num_rows, column_size, table_name)\n", + "description": "In a real-life scenario of an Internet of Things (IoT) application, 137 sensors are generating a large amount of data simultaneously. This data needs to be inserted into a database table that has 5 columns and 80 rows, with each column having a size of 67 characters. The purpose of running this script is to simulate a database exception that may occur due to the high volume of data being inserted at once.\n", + "desc": "In an Internet of Things (IoT) scenario, imagine a database named 'IoTDataDB' that is used to store and analyze sensor data from various IoT devices. This database contains a key table called 'SensorData', which records detailed information about sensor readings. This table consists of 80 rows of data, each representing a specific sensor record, with a total of 5 columns, each containing information of up to 67 characters. These columns may include sensor ID, sensor type, reading value, timestamp, and location.When numerous IoT devices simultaneously transmit data to the 'SensorData' table, such as temperature readings, humidity levels, or motion detections, the database might encounter performance issues. Due to the large volume of concurrent write requests and the lack of effective data partitioning or indexing mechanisms, the database's ability to handle these write operations efficiently is limited. 
This can lead to increased write latency and I/O contention in the database, resulting in anomalies.During peak periods, these anomalies in database performance can negatively impact the accuracy and timeliness of data analysis, hinder real-time monitoring, or cause delays in IoT applications that rely on the data from the 'SensorData' table.\n" + }, + "353": { + "start_time": "1697348390", + "end_time": "1697348462", + "start_timestamp": "2023-10-15 13:39:50", + "end_timestamp": "2023-10-15 13:41:02", + "alerts": [], + "labels": [ + "highly concurrent commits or highly concurrent inserts" + ], + "command": "python anomaly_trigger/main.py --anomaly INSERT_LARGE_DATA", + "script": "import psycopg2\nimport sys\nsys.path.append('/root/DB-GPT/')\nimport time\nimport datetime\nimport random\nimport yaml\nfrom multiprocessing.pool import *\n\nclass DBArgs(object):\n\n def __init__(self, dbtype, config, dbname=None):\n self.dbtype = dbtype\n if self.dbtype == 'mysql':\n self.host = config['host']\n self.port = config['port']\n self.user = config['user']\n self.password = config['password']\n self.dbname = dbname if dbname else config['dbname']\n self.driver = 'com.mysql.jdbc.Driver'\n self.jdbc = 'jdbc:mysql://'\n else:\n self.host = config['host']\n self.port = config['port']\n self.user = config['user']\n self.password = config['password']\n self.dbname = dbname if dbname else config['dbname']\n self.driver = 'org.postgresql.Driver'\n self.jdbc = 'jdbc:postgresql://'\n\nclass Database():\n def __init__(self, args, timeout=-1):\n self.args = args\n self.conn = self.resetConn(timeout)\n\n\n # self.schema = self.compute_table_schema()\n\n def resetConn(self, timeout=-1):\n conn = psycopg2.connect(database=self.args.dbname,\n user=self.args.user,\n password=self.args.password,\n host=self.args.host,\n port=self.args.port)\n return conn\n\n def execute_sqls(self,sql):\n self.conn =self.resetConn(timeout=-1)\n cur = self.conn.cursor()\n cur.execute(sql)\n self.conn.commit()\n cur.close()\n self.conn.close()\n\n def execute_sql_duration(self, duration, sql, max_id=0, commit_interval=500):\n self.conn = self.resetConn(timeout=-1)\n cursor = self.conn.cursor()\n start = time.time()\n cnt = 0\n if duration > 0:\n while (time.time() - start) < duration:\n if max_id > 0:\n id = random.randint(1, max_id - 1)\n cursor.execute(sql + str(id) + ';')\n else:\n cursor.execute(sql)\n cnt += 1\n if cnt % commit_interval == 0:\n self.conn.commit()\n else:\n print(\"error, the duration should be larger than 0\")\n self.conn.commit()\n cursor.close()\n self.conn.close()\n return cnt\n\n def concurrent_execute_sql(self, threads, duration, sql, max_id=0, commit_interval=500):\n pool = ThreadPool(threads)\n results = [pool.apply_async(self.execute_sql_duration, (duration, sql, max_id, commit_interval)) for _ in range(threads)]\n pool.close()\n pool.join()\n return results\n \ndef init():\n #add the config\n config_path = \"/root/DB-GPT/config/tool_config.yaml\"\n with open(config_path, 'r') as config_file:\n config = yaml.safe_load(config_file) \n db_args =DBArgs('pgsql', config)\n return db_args\n\n#create a table\ndef create_table(table_name,colsize, ncolumns):\n db=Database(init())\n column_definitions = ', '.join(f'name{i} varchar({colsize})' for i in range(ncolumns))\n creat_sql = f'CREATE TABLE {table_name} (id int, {column_definitions}, time timestamp);'\n db.execute_sqls(creat_sql)\n\n#delete the table\ndef delete_table(table_name):\n db=Database(init())\n delete_sql=f'DROP TABLE if exists {table_name}'\n 
db.execute_sqls(delete_sql)\n\n#print the current time\ndef print_time():\n current_time = datetime.datetime.now()\n formatted_time = current_time.strftime(\"%Y-%m-%d %H:%M:%S\")\n print(formatted_time)\n\n\ndef insert_large_data(threads,duration,ncolumns,nrows,colsize,table_name='table1'):\n\n print_time()\n #Delete undeleted tables\n delete_table(table_name)\n #create a new table\n create_table(table_name,colsize, ncolumns)\n db=Database(init())\n #insert the data\n #insert_definitions = ', '.join(f'repeat(round(random()*999)::text,{(colsize//3)})' for i in range(ncolumns))\n insert_definitions = ', '.join(f'(SELECT substr(md5(random()::text), 1, {colsize}))' for i in range(ncolumns))\n insert_data=f'insert into {table_name} select generate_series(1,{nrows}),{insert_definitions}, now();'\n db.concurrent_execute_sql(threads,duration,insert_data,commit_interval=1)\n\n #delete the table\n delete_table(table_name)\n \n #print the end time\n print_time()\nif __name__ == \"__main__\":\n # Number of threads to use for concurrent inserts\n num_threads = 137\n \n # Duration for which to run the inserts (in seconds)\n insert_duration = None\n \n # Number of columns in the table\n num_columns = 20\n \n # Number of rows to insert\n num_rows = 63\n \n # Size of each column (in characters)\n column_size = 99\n \n # Table name\n table_name = 'table1'\n \n # Call the insert_large_data function\n insert_large_data(num_threads, insert_duration, num_columns, num_rows, column_size, table_name)\n", + "description": "In a smart home system, 137 devices are simultaneously generating a large amount of data that needs to be inserted into the database. Each device has 20 data points, with each data point having a size of 99 characters. The data generated by these devices needs to be stored in the database efficiently and in a timely manner.\n", + "desc": "In an Internet of Things (IoT) scenario, there is a database named 'IoTDataDB' that is used for collecting and analyzing data from various IoT devices. This database is designed to handle a large volume of data from sensors and devices. In this specific case, the database is experiencing an anomaly related to inserting large amounts of data. The script is executed with the 'INSERT_LARGE_DATA' anomaly option and specific parameters, including the use of 137 threads for concurrent insertion operations. The data being inserted consists of 63 rows, with each row having 20 columns, and each column can store data of up to 99 characters. This insertion process can impact the performance of the database, as the large-scale data insertion can increase the write load on the database server and potentially lead to I/O contention or other performance issues. 
If the database is not properly optimized or lacks appropriate indexing, this anomaly can cause delays or failures in the insertion process, affecting the overall efficiency of the IoT data collection and analysis system.\n" + }, + "354": { + "start_time": "1697348522", + "end_time": "1697348583", + "start_timestamp": "2023-10-15 13:42:02", + "end_timestamp": "2023-10-15 13:43:03", + "alerts": [], + "labels": [ + "highly concurrent updates" + ], + "command": "python anomaly_trigger/main.py --anomaly LOCK_CONTENTION", + "script": "import psycopg2\nimport sys\nsys.path.append('/root/DB-GPT/')\nimport time\nimport datetime\nimport random\nimport yaml\nfrom multiprocessing.pool import *\n\n\nclass DBArgs(object):\n\n def __init__(self, dbtype, config, dbname=None):\n self.dbtype = dbtype\n if self.dbtype == 'mysql':\n self.host = config['host']\n self.port = config['port']\n self.user = config['user']\n self.password = config['password']\n self.dbname = dbname if dbname else config['dbname']\n self.driver = 'com.mysql.jdbc.Driver'\n self.jdbc = 'jdbc:mysql://'\n else:\n self.host = config['host']\n self.port = config['port']\n self.user = config['user']\n self.password = config['password']\n self.dbname = dbname if dbname else config['dbname']\n self.driver = 'org.postgresql.Driver'\n self.jdbc = 'jdbc:postgresql://'\n\nclass Database():\n def __init__(self, args, timeout=-1):\n self.args = args\n self.conn = self.resetConn(timeout)\n\n\n # self.schema = self.compute_table_schema()\n\n def resetConn(self, timeout=-1):\n conn = psycopg2.connect(database=self.args.dbname,\n user=self.args.user,\n password=self.args.password,\n host=self.args.host,\n port=self.args.port)\n return conn\n \n def execute_sqls(self,sql):\n self.conn =self.resetConn(timeout=-1)\n cur = self.conn.cursor()\n cur.execute(sql)\n self.conn.commit()\n cur.close()\n self.conn.close()\n\n def execute_sql_duration(self, duration, sql, max_id=0, commit_interval=500):\n self.conn = self.resetConn(timeout=-1)\n cursor = self.conn.cursor()\n start = time.time()\n cnt = 0\n if duration > 0:\n while (time.time() - start) < duration:\n if max_id > 0:\n id = random.randint(1, max_id - 1)\n cursor.execute(sql + str(id) + ';')\n else:\n cursor.execute(sql)\n cnt += 1\n if cnt % commit_interval == 0:\n self.conn.commit()\n else:\n print(\"error, the duration should be larger than 0\")\n self.conn.commit()\n cursor.close()\n self.conn.close()\n return cnt\n\n def concurrent_execute_sql(self, threads, duration, sql, max_id=0, commit_interval=500):\n pool = ThreadPool(threads)\n results = [pool.apply_async(self.execute_sql_duration, (duration, sql, max_id, commit_interval)) for _ in range(threads)]\n pool.close()\n pool.join()\n return results\n\ndef init():\n #add the config\n config_path = \"/root/DB-GPT/config/tool_config.yaml\"\n with open(config_path, 'r') as config_file:\n config = yaml.safe_load(config_file) \n db_args =DBArgs('pgsql', config)\n return db_args\n\n\n#create a table\ndef create_table(table_name,colsize, ncolumns):\n db=Database(init())\n column_definitions = ', '.join(f'name{i} varchar({colsize})' for i in range(ncolumns))\n creat_sql = f'CREATE TABLE {table_name} (id int, {column_definitions}, time timestamp);'\n db.execute_sqls(creat_sql)\n\n#delete the table\ndef delete_table(table_name):\n db=Database(init())\n delete_sql=f'DROP TABLE if exists {table_name}'\n db.execute_sqls(delete_sql)\n\n#print the current time\ndef print_time():\n current_time = datetime.datetime.now()\n formatted_time = 
current_time.strftime(\"%Y-%m-%d %H:%M:%S\")\n print(formatted_time)\n\n\ndef lock_contention(threads,duration,ncolumns,nrows,colsize,table_name='table1'):\n #create a new table\n print_time()\n delete_table(table_name)\n create_table(table_name,colsize, ncolumns)\n db=Database(init())\n # insert some data to be updated \n insert_definitions = ', '.join(f'(SELECT substr(md5(random()::text), 1, {colsize}))' for i in range(ncolumns))\n insert_data=f'insert into {table_name} select generate_series(1,{nrows}),{insert_definitions}, now();' \n db.execute_sqls(insert_data) \n pool = Pool(threads)\n for _ in range(threads):\n pool.apply_async(\n lock, (table_name, ncolumns, colsize, duration, nrows))\n pool.close()\n pool.join()\n #delete the table\n delete_table(table_name)\n print_time()\n\ndef lock(table_name, ncolumns, colsize, duration, nrows):\n args=init()\n start = time.time()\n #lock_contention\n while time.time()-start < duration:\n conn = psycopg2.connect(database=args.dbname, user=args.user, password=args.password,\n host=args.host, port=args.port)\n cur = conn.cursor()\n while time.time()-start < duration:\n col_name = random.randint(0, ncolumns-1)\n row_name = random.randint(1, nrows-1)\n lock_contention = f'update {table_name} set name{col_name}=(SELECT substr(md5(random()::text), 1, {colsize})) where id ={row_name}'\n #db.concurrent_execute_sql(threads,duration,lock_contention,nrows)\n cur.execute(lock_contention)\n conn.commit()\n conn.commit()\n conn.close()\n\nif __name__ == \"__main__\":\n # Number of threads to use for concurrent inserts\n num_threads = 107\n \n # Duration for which to run the inserts (in seconds)\n insert_duration = None\n \n # Number of columns in the table\n num_columns = 95\n \n # Number of rows to insert\n num_rows = 242\n \n # Size of each column (in characters)\n column_size = 64\n \n # Table name\n table_name = 'table1'\n \n # Call the insert_large_data function\n lock_contention(num_threads, insert_duration, num_columns, num_rows, column_size, table_name)\n", + "description": "In a database for a reservation system, 107 users simultaneously try to update a table with 95 columns and 242 rows of reservation records, each with a column size of 64 characters. Multiple users compete to lock the table for performing the update operation, leading to a database exception.\n", + "desc": "In an Internet of Things (IoT) scenario, there is a database system called 'IoTDataDB' that is specifically designed to store and process data from various IoT devices. This database contains a key table named 'DeviceReadings', which records detailed information about the readings from these devices. The 'DeviceReadings' table consists of 242 rows of data, each representing a reading entry from a specific device. This table has 95 columns, each containing information of up to 64 characters. These columns may include device ID, timestamp, sensor type, sensor reading value, location, device status, and other relevant attributes. In this particular scenario, 107 IoT devices are simultaneously sending their readings to the 'DeviceReadings' table at the same time. This high concurrency can result in contention for locking the database table, especially if multiple devices attempt to access or update the same or adjacent rows in the table. When the lock contention lasts for a significant period of time, it can cause performance issues in the database. 
As a result, other devices may experience delayed processing or failure of their reading requests, potentially impacting the overall functionality and efficiency of the IoT system.Furthermore, if these lock contention incidents occur frequently, it can lead to rapid growth in the database transaction log and increased storage consumption. This may temporarily interrupt the database services and hinder the smooth operation of the IoT system. It is important to implement measures such as optimizing the locking mechanism, improving concurrency control strategies, and tuning the database system to minimize lock contention and ensure reliable and efficient processing of IoT device readings.\n" + }, + "355": { + "start_time": "1697348643", + "end_time": "1697348760", + "start_timestamp": "2023-10-15 13:44:03", + "end_timestamp": "2023-10-15 13:46:00", + "alerts": [], + "labels": [ + "highly deletes" + ], + "command": "python anomaly_trigger/main.py --anomaly VACUUM", + "script": "import psycopg2\nimport sys\nsys.path.append('/root/DB-GPT/')\nimport time\nimport datetime\nimport random\nimport yaml\nfrom multiprocessing.pool import *\n\n\nclass DBArgs(object):\n\n def __init__(self, dbtype, config, dbname=None):\n self.dbtype = dbtype\n if self.dbtype == 'mysql':\n self.host = config['host']\n self.port = config['port']\n self.user = config['user']\n self.password = config['password']\n self.dbname = dbname if dbname else config['dbname']\n self.driver = 'com.mysql.jdbc.Driver'\n self.jdbc = 'jdbc:mysql://'\n else:\n self.host = config['host']\n self.port = config['port']\n self.user = config['user']\n self.password = config['password']\n self.dbname = dbname if dbname else config['dbname']\n self.driver = 'org.postgresql.Driver'\n self.jdbc = 'jdbc:postgresql://'\n\nclass Database():\n def __init__(self, args, timeout=-1):\n self.args = args\n self.conn = self.resetConn(timeout)\n\n\n # self.schema = self.compute_table_schema()\n\n def resetConn(self, timeout=-1):\n conn = psycopg2.connect(database=self.args.dbname,\n user=self.args.user,\n password=self.args.password,\n host=self.args.host,\n port=self.args.port)\n return conn\n \n def execute_sqls(self,sql):\n self.conn =self.resetConn(timeout=-1)\n cur = self.conn.cursor()\n cur.execute(sql)\n self.conn.commit()\n cur.close()\n self.conn.close()\n\n def execute_sql_duration(self, duration, sql, max_id=0, commit_interval=500):\n self.conn = self.resetConn(timeout=-1)\n cursor = self.conn.cursor()\n start = time.time()\n cnt = 0\n if duration > 0:\n while (time.time() - start) < duration:\n if max_id > 0:\n id = random.randint(1, max_id - 1)\n cursor.execute(sql + str(id) + ';')\n else:\n cursor.execute(sql)\n cnt += 1\n if cnt % commit_interval == 0:\n self.conn.commit()\n else:\n print(\"error, the duration should be larger than 0\")\n self.conn.commit()\n cursor.close()\n self.conn.close()\n return cnt\n\n def concurrent_execute_sql(self, threads, duration, sql, max_id=0, commit_interval=500):\n pool = ThreadPool(threads)\n results = [pool.apply_async(self.execute_sql_duration, (duration, sql, max_id, commit_interval)) for _ in range(threads)]\n pool.close()\n pool.join()\n return results\n \ndef init():\n #add the config\n config_path = \"/root/DB-GPT/config/tool_config.yaml\"\n with open(config_path, 'r') as config_file:\n config = yaml.safe_load(config_file) \n db_args =DBArgs('pgsql', config)\n return db_args\n\n\n#create a table\ndef create_table(table_name,colsize, ncolumns):\n db=Database(init())\n column_definitions = ', 
'.join(f'name{i} varchar({colsize})' for i in range(ncolumns))\n creat_sql = f'CREATE TABLE {table_name} (id int, {column_definitions}, time timestamp);'\n db.execute_sqls(creat_sql)\n\n#delete the table\ndef delete_table(table_name):\n db=Database(init())\n delete_sql=f'DROP TABLE if exists {table_name}'\n db.execute_sqls(delete_sql)\n\n#print the current time\ndef print_time():\n current_time = datetime.datetime.now()\n formatted_time = current_time.strftime(\"%Y-%m-%d %H:%M:%S\")\n print(formatted_time)\n\n\ndef vacuum(threads,duration,ncolumns,nrows,colsize,table_name='table1'):\n db=Database(init())\n #create a new table\n print_time()\n delete_table(table_name)\n create_table(table_name,colsize, ncolumns)\n\n # insert some data to be deleted\n insert_definitions = ', '.join(f'(SELECT substr(md5(random()::text), 1, {colsize}))' for i in range(ncolumns))\n insert_data=f'insert into {table_name} select generate_series(1,{nrows}),{insert_definitions}, now();' \n db.execute_sqls(insert_data) \n\n # delete 80% of the rows\n delete_nrows=int(nrows*0.8)\n vacuum=f'delete from {table_name} where id < {delete_nrows};'\n db.execute_sqls(vacuum)\n\n # do the select , then the vacuum occurs\n select='select * from '+table_name+' where id='\n db.concurrent_execute_sql(threads,duration,select,nrows)\n\n #delete the table\n delete_table(table_name)\n print_time()\n\n\nif __name__ == \"__main__\":\n # Number of threads to use for concurrent inserts\n num_threads = 55\n \n # Duration for which to run the inserts (in seconds)\n insert_duration = None\n \n # Number of columns in the table\n num_columns = 5\n \n # Number of rows to insert\n num_rows = 3955578\n \n # Size of each column (in characters)\n column_size = 99\n \n # Table name\n table_name = 'table1'\n \n # Call the insert_large_data function\n vacuum(num_threads, insert_duration, num_columns, num_rows, column_size, table_name)\n", + "description": "In a database reservation system, 55 users are simultaneously searching for available rooms in a database table containing 5 columns and 3,955,578 rows. Each column has a maximum size of 99 characters. However, the search operation is causing a database exception due to a large-scale data cleaning operation that was performed prior to the search.\n", + "desc": "In an Internet of Things (IoT) scenario, suppose there is a database named 'IoTDataDB' used to store and analyze sensor data from various IoT devices. This database contains a table called 'SensorReadings' which stores detailed sensor readings from these devices. The table consists of 3,955,578 rows of data, with each row representing a specific sensor reading. There are 5 columns in the table, with each column able to store up to 99 characters of information. These columns might include sensor ID, reading type, reading value, timestamp, and sensor location. In this scenario, due to the large volume of sensor data being constantly generated and recorded in the database, there is a need for periodic maintenance to optimize the performance and storage efficiency. The 'VACUUM' command is used in the provided script to perform this maintenance operation. This command cleans up and reclaims the unused or redundant storage space in the database, ensuring efficient storage utilization. The script specifies that the 'VACUUM' command should be executed with 55 threads. This means that the maintenance operation will be performed concurrently using 55 parallel threads, allowing for faster and more efficient processing. 
By utilizing multiple threads, the 'VACUUM' command can be executed in a distributed manner, effectively leveraging the computing resources available and reducing the time required for the maintenance operation.\n" + }, + "356": { + "start_time": "1697348820", + "end_time": "1697348935", + "start_timestamp": "2023-10-15 13:47:00", + "end_timestamp": "2023-10-15 13:48:55", + "alerts": [], + "labels": [ + "too many indexes" + ], + "command": "python anomaly_trigger/main.py --anomaly REDUNDANT_INDEX", + "script": "import psycopg2\nimport sys\nsys.path.append('/root/DB-GPT/')\nimport time\nimport datetime\nimport random\nimport yaml\nfrom multiprocessing.pool import *\n\n\nclass DBArgs(object):\n\n def __init__(self, dbtype, config, dbname=None):\n self.dbtype = dbtype\n if self.dbtype == 'mysql':\n self.host = config['host']\n self.port = config['port']\n self.user = config['user']\n self.password = config['password']\n self.dbname = dbname if dbname else config['dbname']\n self.driver = 'com.mysql.jdbc.Driver'\n self.jdbc = 'jdbc:mysql://'\n else:\n self.host = config['host']\n self.port = config['port']\n self.user = config['user']\n self.password = config['password']\n self.dbname = dbname if dbname else config['dbname']\n self.driver = 'org.postgresql.Driver'\n self.jdbc = 'jdbc:postgresql://'\n\nclass Database():\n def __init__(self, args, timeout=-1):\n self.args = args\n self.conn = self.resetConn(timeout)\n\n\n # self.schema = self.compute_table_schema()\n\n def resetConn(self, timeout=-1):\n conn = psycopg2.connect(database=self.args.dbname,\n user=self.args.user,\n password=self.args.password,\n host=self.args.host,\n port=self.args.port)\n return conn\n \n def execute_sqls(self,sql):\n self.conn =self.resetConn(timeout=-1)\n cur = self.conn.cursor()\n cur.execute(sql)\n self.conn.commit()\n cur.close()\n self.conn.close()\n\n def execute_sql_duration(self, duration, sql, max_id=0, commit_interval=500):\n self.conn = self.resetConn(timeout=-1)\n cursor = self.conn.cursor()\n start = time.time()\n cnt = 0\n if duration > 0:\n while (time.time() - start) < duration:\n if max_id > 0:\n id = random.randint(1, max_id - 1)\n cursor.execute(sql + str(id) + ';')\n else:\n cursor.execute(sql)\n cnt += 1\n if cnt % commit_interval == 0:\n self.conn.commit()\n else:\n print(\"error, the duration should be larger than 0\")\n self.conn.commit()\n cursor.close()\n self.conn.close()\n return cnt\n\n def concurrent_execute_sql(self, threads, duration, sql, max_id=0, commit_interval=500):\n pool = ThreadPool(threads)\n results = [pool.apply_async(self.execute_sql_duration, (duration, sql, max_id, commit_interval)) for _ in range(threads)]\n pool.close()\n pool.join()\n return results\n \n def build_index(self, table_name, idx_num):\n self.conn = self.resetConn(timeout=-1)\n cursor = self.conn.cursor()\n \n for i in range(0, idx_num):\n the_sql = 'CREATE INDEX index_' + table_name + '_' + str(i) + ' ON ' + table_name + '(name' + str(i) + ');'\n print(the_sql)\n cursor.execute(the_sql)\n\n \n self.conn.commit()\n self.conn.close()\n return\n\n\n \n def drop_index(self,table_name):\n self.conn = self.resetConn(timeout=-1)\n cursor = self.conn.cursor()\n cursor.execute(\"select indexname from pg_indexes where tablename='\"+table_name+\"';\")\n idxs = cursor.fetchall()\n for idx in idxs:\n the_sql = 'DROP INDEX ' + idx[0] + ';'\n cursor.execute(the_sql)\n print(the_sql)\n self.conn.commit()\n self.conn.close()\n return\n\n\ndef init():\n #add the config\n config_path = 
\"/root/DB-GPT/config/tool_config.yaml\"\n with open(config_path, 'r') as config_file:\n config = yaml.safe_load(config_file) \n db_args =DBArgs('pgsql', config)\n return db_args\n\n\n#create a table\ndef create_table(table_name,colsize, ncolumns):\n db=Database(init())\n column_definitions = ', '.join(f'name{i} varchar({colsize})' for i in range(ncolumns))\n creat_sql = f'CREATE TABLE {table_name} (id int, {column_definitions}, time timestamp);'\n db.execute_sqls(creat_sql)\n\n#delete the table\ndef delete_table(table_name):\n db=Database(init())\n delete_sql=f'DROP TABLE if exists {table_name}'\n db.execute_sqls(delete_sql)\n\n#print the current time\ndef print_time():\n current_time = datetime.datetime.now()\n formatted_time = current_time.strftime(\"%Y-%m-%d %H:%M:%S\")\n print(formatted_time)\n\ndef redundent_index(threads,duration,ncolumns,nrows,colsize,nindex,table_name='table1'):\n #create a new table\n print_time()\n delete_table(table_name)\n create_table(table_name,colsize, ncolumns)\n db=Database(init())\n # insert some data to be updated \n insert_definitions = ', '.join(f'(SELECT substr(md5(random()::text), 1, {colsize}))' for i in range(ncolumns))\n insert_data=f'insert into {table_name} select generate_series(1,{nrows}),{insert_definitions}, now();' \n db.execute_sqls(insert_data) \n\n #initialization of the indexes\n nindex=int((nindex*ncolumns)/10)\n db.build_index(table_name,nindex)\n id_index='CREATE INDEX index_'+table_name+'_id ON '+table_name+'(id);'\n db.execute_sqls(id_index)\n\n #lock_contention\n pool = Pool(threads)\n for _ in range(threads):\n pool.apply_async(\n lock, (table_name, ncolumns, colsize, duration, nrows))\n pool.close()\n pool.join()\n\n #drop the index\n db.drop_index(table_name)\n\n #delete the table\n delete_table(table_name)\n print_time()\n\n\ndef lock(table_name, ncolumns, colsize, duration, nrows):\n args=init()\n start = time.time()\n #lock_contention\n while time.time()-start < duration:\n conn = psycopg2.connect(database=args.dbname, user=args.user, password=args.password,\n host=args.host, port=args.port)\n cur = conn.cursor()\n while time.time()-start < duration:\n col_name = random.randint(0, ncolumns-1)\n row_name = random.randint(1, nrows-1)\n lock_contention = f'update {table_name} set name{col_name}=(SELECT substr(md5(random()::text), 1, {colsize})) where id ={row_name}'\n #db.concurrent_execute_sql(threads,duration,lock_contention,nrows)\n cur.execute(lock_contention)\n conn.commit()\n conn.commit()\n conn.close()\n\nif __name__ == \"__main__\":\n # Number of threads to use for concurrent inserts\n num_threads = 5\n \n # Duration for which to run the inserts (in seconds)\n insert_duration = None\n \n # Number of columns in the table\n num_columns = 73\n \n # Number of rows to insert\n num_rows = 747606\n \n # Size of each column (in characters)\n column_size = 100\n \n # Table name\n table_name = 'table1'\n \n nindex=6\n \n # Call the insert_large_data function\n redundent_index(num_threads, insert_duration, num_columns, num_rows, column_size, nindex,table_name)\n", + "description": "In a database for a social media platform, when optimizing the search function, a large number of indexes are created for user profiles, posts, and comments. However, some of these indexes are redundant and do not significantly improve the search performance. 
This can lead to unnecessary storage consumption and a potential decrease in query performance.\n", + "desc": "In the scenario of a business intelligence system, the script `python anomaly_trigger/main.py --anomaly REDUNDANT_INDEX --threads 5 --ncolumn 73 --colsize 100 --nrow 747606` is executed. In the background, there is a database named 'CorporateFinanceDB', which is used for storing and processing financial data of large corporations. This database contains multiple tables, one of which is the 'FinancialRecords' table that records various financial transactions and statement information of the company. The table consists of 747,606 rows of data, each representing a financial record, with a total of 73 columns, each containing information of up to 100 characters. These columns may include transaction ID, transaction type (such as income, expenditure, assets, liabilities), amount, date, department, project code, budget code, financial year, audit status, and so on.In the context of this specific script, the database administrator wants to analyze financial data and optimize performance. To achieve this, the administrator decides to create redundant indexes before executing queries. These indexes are created based on different columns, like transaction type, date range, department, or project code, to accelerate complex financial analysis queries.However, the creation of redundant indexes can lead to issues in terms of additional storage usage and performance overhead. It might also cause database fragmentation, which could further impact performance. Furthermore, if frequent index operations are performed, it could result in delayed report generation, eventually affecting the efficiency of the decision-making process in a business intelligence environment.By executing this script with the specified parameters, the impact of redundant index creation on performance can be observed, and potential anomalies related to redundant indexing can be triggered, allowing the administrator to identify and resolve any performance issues in the database.\n" + }, + "357": { + "start_time": "1697348995", + "end_time": "1697349086", + "start_timestamp": "2023-10-15 13:49:55", + "end_timestamp": "2023-10-15 13:51:26", + "alerts": [], + "labels": [ + "INSERT_LARGE_DATA", + "IO_CONTENTION" + ], + "command": "python anomaly_trigger/main.py --anomaly INSERT_LARGE_DATA,IO_CONTENTION", + "script": "import os\nimport datetime\n\n#print the current time\ndef print_time():\n current_time = datetime.datetime.now()\n formatted_time = current_time.strftime(\"%Y-%m-%d %H:%M:%S\")\n print(formatted_time)\n\nif __name__ == \"__main__\":\n print_time()\n command = (\n \"su - root -c 'cd /sysbench-tpcc-master; \"\n \"./tpcc.lua --db-driver=pgsql --tables=2 --scale=3 --threads=50 --events=0 \"\n \"--pgsql-host=xxxx --pgsql-user=xxxx --pgsql-password=xxxx \"\n \"--pgsql-port=5432 --pgsql-db=tpcc --time=90 --rand-type=uniform --report-interval=10 run'\"\n )\n\n os.system(command)\n print_time()\n", + "description": "In a file sharing system, multiple users are simultaneously uploading, downloading, or editing files. This leads to competition for input/output resources, resulting in slower file transfers.\n", + "desc": "In a file sharing system scenario, suppose there is a database called 'FileShareDB', which is used by individuals or organizations to share files. 
This database not only stores the files themselves but also maintains metadata about the files, such as file size, upload date, last modification date, access permissions, and download counts. On a typical day, multiple users may be simultaneously uploading, downloading, or editing files. For example, a team working on a project may frequently upload new versions of files, while others download them for viewing or editing. Additionally, the system may be used to store and share large files, such as presentations, video conference recordings, or design drawings. Due to the high concurrency of file operations, the 'FileShareDB' database faces challenges related to I/O (input/output) contention. When multiple users simultaneously upload or download large files, the system's storage and network bandwidth may be under significant strain. This I/O contention can result in slower file transfer speeds, especially in situations where there is limited bandwidth or insufficient server processing power. Furthermore, frequent write operations in the database, such as file uploads and updates to metadata, can impact the performance of the database. During peak usage times, the database may experience issues related to locking and transaction management, causing further slowdowns in file processing and metadata updates.\n" + }, + "358": { + "start_time": "1697349146", + "end_time": "1697349206", + "start_timestamp": "2023-10-15 13:52:26", + "end_timestamp": "2023-10-15 13:53:26", + "alerts": [], + "labels": [ + "POOR JOIN PERFORMANCE", + "CPU CONTENTION" + ], + "command": "python anomaly_trigger/main.py --anomaly POOR_JOIN_PERFORMANCE,CPU_CONTENTION", + "script": "import os\nimport re\nimport time\n\nimport psycopg2\n\n\nREPEATCOUNT = 1\nTIMELOGPATH = str(int(time.time())) + \"_job_trigger_time_log.txt\"\nTIMELOG = open(TIMELOGPATH, 'w+')\n\n\nclass Database():\n\n def __init__(self):\n self.conn = None\n self.conn = psycopg2.connect(database='imdbload',\n user='xxxx',\n password='xxxx',\n host='xxxx',\n port=5432)\n\n def execute_sql(self, sql):\n fail = 1\n cur = self.conn.cursor()\n i = 0\n cnt = 3\n while fail == 1 and i < cnt:\n try:\n fail = 0\n cur.execute(sql)\n except BaseException as error:\n fail = 1\n print(error)\n res = []\n if fail == 0:\n res = cur.fetchall()\n i = i + 1\n if fail == 1:\n # print(\"SQL Execution Fatal!!\", sql)\n return 0, ''\n elif fail == 0:\n return 1, res\n\n\ndef all_sql_files():\n res_path = \"{}/join-order-benchmark-master/\".format(\n os.path.dirname(os.path.abspath(__file__)))\n # all_file_list = list(filter(file_filter, os.listdir(res_path)))\n # all_file_list = sorted(all_file_list, key=custom_sort)\n all_file_list = [\n '1a.sql', '1b.sql', '1c.sql', '1d.sql',\n '2a.sql', '2b.sql', '2c.sql', '2d.sql',\n '3a.sql', '3b.sql', '3c.sql',\n '4a.sql', '4b.sql', '4c.sql',\n '5a.sql', '5b.sql', '5c.sql',\n '6a.sql', '6b.sql', '6c.sql', '6d.sql', '6e.sql', '6f.sql',\n '7a.sql', '7b.sql', '7c.sql',\n '8a.sql', '8b.sql', '8c.sql', '8d.sql',\n '9a.sql', '9b.sql', '9c.sql', '9d.sql',\n '10a.sql', '10b.sql', '10c.sql',\n '11a.sql', '11b.sql', '11c.sql', '11d.sql',\n '12a.sql', '12b.sql', '12c.sql',\n '13a.sql', '13b.sql', '13c.sql', '13d.sql',\n '14a.sql', '14b.sql', '14c.sql',\n '15a.sql', '15b.sql', '15c.sql', '15d.sql',\n '16a.sql', '16b.sql', '16c.sql', '16d.sql',\n '17a.sql', '17b.sql', '17c.sql', '17d.sql', '17e.sql', '17f.sql',\n '18a.sql', '18b.sql', '18c.sql',\n '19a.sql', '19b.sql', '19c.sql', '19d.sql',\n '20a.sql', '20b.sql', '20c.sql',\n '21a.sql', '21b.sql', 
'21c.sql',\n '22a.sql', '22b.sql', '22c.sql', '22d.sql',\n '23a.sql', '23b.sql', '23c.sql',\n '24a.sql', '24b.sql',\n '25a.sql', '25b.sql', '25c.sql',\n '26a.sql', '26b.sql', '26c.sql',\n '27a.sql', '27b.sql', '27c.sql',\n '28a.sql', '28b.sql', '28c.sql',\n '29a.sql', '29b.sql', '29c.sql',\n '30a.sql', '30b.sql', '30c.sql',\n '31a.sql', '31b.sql', '31c.sql',\n '32a.sql', '32b.sql',\n '33a.sql', '33b.sql', '33c.sql']\n\n print(all_file_list)\n files_list = []\n for file in all_file_list:\n files_list.append(res_path + file)\n return files_list\n\n\ndef custom_sort(item):\n # \u63d0\u53d6\u6570\u5b57\u548c\u5b57\u6bcd\u90e8\u5206\n match = re.match(r'(\\d+)(\\D+)', item)\n # \u5c06\u6570\u5b57\u90e8\u5206\u8f6c\u6362\u4e3a\u6574\u6570\u4ee5\u8fdb\u884c\u6bd4\u8f83\n num_part = int(match.group(1))\n # \u8fd4\u56de\u5143\u7ec4\u4ee5\u6309\u6570\u5b57\u548c\u5b57\u6bcd\u6392\u5e8f\n return (num_part, match.group(2))\n\n\ndef file_filter(f):\n if f[-4:] == '.sql' and 'schema' not in f and 'fkindexes' not in f:\n return True\n else:\n return False\n\n\ndef get_sql_from_file(file_name):\n file = open(file_name)\n lines = file.readlines().copy()\n sql = ''\n for line in lines:\n sql += line\n sql = sql.replace('\n', ' ').replace(' ', ' ').replace(' ', ' ')\n file.close()\n return sql\n\n\ndef test_hint_from_file(sql_file):\n db = Database()\n sql = get_sql_from_file(sql_file)\n success, result_cont = db.execute_sql(sql)\n print(success, result_cont)\n\n\ndef test_all():\n sql_files = all_sql_files()\n\n for sql_file in list(sql_files)[:-10]:\n if sql_file:\n test_hint_from_file(sql_file)\n\n\ndef test_one():\n res_path = \"{}/join-order-benchmark-master/\".format(\n os.path.dirname(os.path.abspath(__file__)))\n test_hint_from_file(res_path + '1a.sql')\n\n\nif __name__ == '__main__':\n for i in range(0, REPEATCOUNT):\n TIMELOG.write(str(int(time.time()))+\";\")\n test_all()\n TIMELOG.write(str(int(time.time()))+\"\n\")\n TIMELOG.flush()\n\n TIMELOG.close()\n\n)\n", + "description": "In a database system, when multiple users simultaneously execute a join operation on a large dataset, with each user competing for CPU resources, it can lead to contention and result in poor performance of the join operation.\n", + "desc": "In an IoT scenario, a database named 'SensorDataDB' is used to store data collected from various sensors. This database contains a table called 'SensorReadings', which records information such as temperature, humidity, pressure, light, and motion readings from 100 sensors. There are multiple rows in this table, with each row representing a specific sensor reading and containing various columns for storing the sensor ID, reading type, reading value, timestamp, sensor location, and status information. In this scenario, due to the lack of proper indexing or optimization, queries that involve joining multiple tables can result in poor performance. When the database server is running multiple join queries simultaneously, the CPU resources may become contention, causing a decrease in query efficiency. This CPU contention could be a result of the server not having sufficient CPU resources to handle the compute-intensive queries or having too many queries running simultaneously. 
Thus, it would lead to slower query execution and overall reduced performance of the database in handling join operations.\n" + }, + "359": { + "start_time": "1697349266", + "end_time": "1697349407", + "start_timestamp": "2023-10-15 13:54:26", + "end_timestamp": "2023-10-15 13:56:47", + "alerts": [], + "labels": [ + "FETCH_LARGE_DATA", + "CORRELATED SUBQUERY" + ], + "command": "python anomaly_trigger/main.py --anomaly FETCH_LARGE_DATA,CORRELATED_SUBQUERY", + "script": "import os\nimport re\nimport time\n\nimport psycopg2\n\n\nREPEATCOUNT = 1\nTIMELOGPATH = str(int(time.time())) + \"_tpch_trigger_time_log.txt\"\nTIMELOG = open(TIMELOGPATH, 'w+')\n\n\nclass Database():\n\n def __init__(self):\n self.conn = None\n self.conn = psycopg2.connect(database='tpch',\n user='xxxx',\n password='xxxx',\n host='xxxx',\n port=5432)\n\n def execute_sql(self, sql):\n fail = 1\n cur = self.conn.cursor()\n i = 0\n cnt = 3\n while fail == 1 and i < cnt:\n try:\n fail = 0\n cur.execute(sql)\n except BaseException as error:\n fail = 1\n print(error)\n res = []\n if fail == 0:\n res = cur.fetchall()\n i = i + 1\n if fail == 1:\n # print(\"SQL Execution Fatal!!\", sql)\n return 0, ''\n elif fail == 0:\n return 1, res\n\n\ndef all_sql_files():\n res_path = \"{}/tpch-queries/\".format(\n os.path.dirname(os.path.abspath(__file__)))\n # all_file_list = list(filter(file_filter, os.listdir(res_path)))\n # all_file_list = sorted(all_file_list, key=custom_sort)\n all_file_list = [\n '4.explain.sql']\n\n print(all_file_list)\n files_list = []\n for file in all_file_list:\n files_list.append(res_path + file)\n return files_list\n\n\ndef custom_sort(item):\n # \u63d0\u53d6\u6570\u5b57\u548c\u5b57\u6bcd\u90e8\u5206\n match = re.match(r'(\\d+)(\\D+)', item)\n # \u5c06\u6570\u5b57\u90e8\u5206\u8f6c\u6362\u4e3a\u6574\u6570\u4ee5\u8fdb\u884c\u6bd4\u8f83\n num_part = int(match.group(1))\n # \u8fd4\u56de\u5143\u7ec4\u4ee5\u6309\u6570\u5b57\u548c\u5b57\u6bcd\u6392\u5e8f\n return (num_part, match.group(2))\n\n\ndef file_filter(f):\n if f[-4:] == '.sql' and 'schema' not in f and 'fkindexes' not in f:\n return True\n else:\n return False\n\n\ndef get_sql_from_file(file_name):\n file = open(file_name)\n lines = file.readlines().copy()\n sql = ''\n for line in lines:\n sql += line\n sql = sql.replace('\n', ' ').replace(' ', ' ').replace(' ', ' ')\n file.close()\n return sql\n\n\ndef test_hint_from_file(sql_file):\n db = Database()\n sql = get_sql_from_file(sql_file)\n success, result_cont = db.execute_sql(sql)\n print(success, result_cont)\n\n\ndef test_all():\n sql_files = all_sql_files()\n\n for sql_file in list(sql_files):\n if sql_file:\n test_hint_from_file(sql_file)\n\n\ndef test_one():\n res_path = \"{}/tpch-queries/\".format(\n os.path.dirname(os.path.abspath(__file__)))\n test_hint_from_file(res_path + '1.explain.sql')\n\n\nif __name__ == '__main__':\n for i in range(0, REPEATCOUNT):\n TIMELOG.write(str(int(time.time()))+\";\")\n test_all()\n TIMELOG.write(str(int(time.time()))+\"\n\")\n TIMELOG.flush()\n\n TIMELOG.close()\n", + "description": "In an e-commerce platform's database, when a large amount of inventory needs to be fetched for each product, the performance of the query may be affected because it requires executing related subqueries.\n", + "desc": "In a file sharing system where teams or organizations collaborate and share files, there is a database called 'TeamFileShareDB'. 
This database stores not only the files themselves but also metadata about the files such as uploader information, file size, creation and modification dates, version history, access permissions, and download counts. Multiple users can simultaneously upload, download, or edit files in this system. For example, a project team might be working on an important report, with team members frequently uploading the latest versions of files while others download them for viewing or editing. Additionally, the system supports the storage and sharing of large files such as presentations, video conference recordings, or design drawings. However, due to the high concurrency in file operations, the database 'TeamFileShareDB' faces challenges related to fetching large data and executing correlated subqueries. When multiple users simultaneously upload or download large files, the system's storage and network bandwidth may be strained, leading to slower file transfer speeds. Moreover, when queries require performing related subqueries to obtain inventory information for a large number of products, the database might need to read a significant amount of data from the disk, resulting in I/O bottlenecks.\n" + }, + "360": { + "start_time": "1697349468", + "end_time": "1697349539", + "start_timestamp": "2023-10-15 13:57:48", + "end_timestamp": "2023-10-15 13:58:59", + "alerts": [], + "labels": [ + "highly concurrent commits or highly concurrent inserts" + ], + "command": "python anomaly_trigger/main.py --anomaly INSERT_LARGE_DATA", + "script": "import psycopg2\nimport sys\nsys.path.append('/root/DB-GPT/')\nimport time\nimport datetime\nimport random\nimport yaml\nfrom multiprocessing.pool import *\n\nclass DBArgs(object):\n\n def __init__(self, dbtype, config, dbname=None):\n self.dbtype = dbtype\n if self.dbtype == 'mysql':\n self.host = config['host']\n self.port = config['port']\n self.user = config['user']\n self.password = config['password']\n self.dbname = dbname if dbname else config['dbname']\n self.driver = 'com.mysql.jdbc.Driver'\n self.jdbc = 'jdbc:mysql://'\n else:\n self.host = config['host']\n self.port = config['port']\n self.user = config['user']\n self.password = config['password']\n self.dbname = dbname if dbname else config['dbname']\n self.driver = 'org.postgresql.Driver'\n self.jdbc = 'jdbc:postgresql://'\n\nclass Database():\n def __init__(self, args, timeout=-1):\n self.args = args\n self.conn = self.resetConn(timeout)\n\n\n # self.schema = self.compute_table_schema()\n\n def resetConn(self, timeout=-1):\n conn = psycopg2.connect(database=self.args.dbname,\n user=self.args.user,\n password=self.args.password,\n host=self.args.host,\n port=self.args.port)\n return conn\n\n def execute_sqls(self,sql):\n self.conn =self.resetConn(timeout=-1)\n cur = self.conn.cursor()\n cur.execute(sql)\n self.conn.commit()\n cur.close()\n self.conn.close()\n\n def execute_sql_duration(self, duration, sql, max_id=0, commit_interval=500):\n self.conn = self.resetConn(timeout=-1)\n cursor = self.conn.cursor()\n start = time.time()\n cnt = 0\n if duration > 0:\n while (time.time() - start) < duration:\n if max_id > 0:\n id = random.randint(1, max_id - 1)\n cursor.execute(sql + str(id) + ';')\n else:\n cursor.execute(sql)\n cnt += 1\n if cnt % commit_interval == 0:\n self.conn.commit()\n else:\n print(\"error, the duration should be larger than 0\")\n self.conn.commit()\n cursor.close()\n self.conn.close()\n return cnt\n\n def concurrent_execute_sql(self, threads, duration, sql, max_id=0, commit_interval=500):\n 
pool = ThreadPool(threads)\n results = [pool.apply_async(self.execute_sql_duration, (duration, sql, max_id, commit_interval)) for _ in range(threads)]\n pool.close()\n pool.join()\n return results\n \ndef init():\n #add the config\n config_path = \"/root/DB-GPT/config/tool_config.yaml\"\n with open(config_path, 'r') as config_file:\n config = yaml.safe_load(config_file) \n db_args =DBArgs('pgsql', config)\n return db_args\n\n#create a table\ndef create_table(table_name,colsize, ncolumns):\n db=Database(init())\n column_definitions = ', '.join(f'name{i} varchar({colsize})' for i in range(ncolumns))\n creat_sql = f'CREATE TABLE {table_name} (id int, {column_definitions}, time timestamp);'\n db.execute_sqls(creat_sql)\n\n#delete the table\ndef delete_table(table_name):\n db=Database(init())\n delete_sql=f'DROP TABLE if exists {table_name}'\n db.execute_sqls(delete_sql)\n\n#print the current time\ndef print_time():\n current_time = datetime.datetime.now()\n formatted_time = current_time.strftime(\"%Y-%m-%d %H:%M:%S\")\n print(formatted_time)\n\n\ndef insert_large_data(threads,duration,ncolumns,nrows,colsize,table_name='table1'):\n\n print_time()\n #Delete undeleted tables\n delete_table(table_name)\n #create a new table\n create_table(table_name,colsize, ncolumns)\n db=Database(init())\n #insert the data\n #insert_definitions = ', '.join(f'repeat(round(random()*999)::text,{(colsize//3)})' for i in range(ncolumns))\n insert_definitions = ', '.join(f'(SELECT substr(md5(random()::text), 1, {colsize}))' for i in range(ncolumns))\n insert_data=f'insert into {table_name} select generate_series(1,{nrows}),{insert_definitions}, now();'\n db.concurrent_execute_sql(threads,duration,insert_data,commit_interval=1)\n\n #delete the table\n delete_table(table_name)\n \n #print the end time\n print_time()\nif __name__ == \"__main__\":\n # Number of threads to use for concurrent inserts\n num_threads = 76\n \n # Duration for which to run the inserts (in seconds)\n insert_duration = None\n \n # Number of columns in the table\n num_columns = 18\n \n # Number of rows to insert\n num_rows = 56\n \n # Size of each column (in characters)\n column_size = 77\n \n # Table name\n table_name = 'table1'\n \n # Call the insert_large_data function\n insert_large_data(num_threads, insert_duration, num_columns, num_rows, column_size, table_name)\n", + "description": "In an IoT application, 76 sensors are generating a large amount of data simultaneously. This data needs to be inserted into a database table with 18 columns, where each column can hold up to 77 characters. There are a total of 56 rows in the table. The purpose of this simulation is to test the database's ability to handle the insertion of a large volume of data, which may result in a performance degradation or database exception if not properly optimized.\n", + "desc": "In an Internet of Things (IoT) scenario, consider a database used for collecting and analyzing sensor data, called 'SensorDataDB'. This database is designed to handle a large volume of data from various types of sensors. The primary table in the database is named 'SensorReadings' and it contains 56 rows of data, where each row represents sensor readings for a specific event or time period. The 'SensorReadings' table has 18 columns including sensor ID, reading type, reading value, timestamp, sensor location, and status information, with each column having a size of 77 characters. Now, suppose that 76 sensors start transmitting data simultaneously at a very high frequency. 
This high volume of data being inserted into the database can result in performance issues. If the database is not properly optimized, such as through effective data partitioning, buffering mechanisms, or appropriate indexing, it may face difficulties in efficiently processing these numerous concurrent insert requests. As a result, the database's ability to handle these insert operations may be compromised, leading to anomalies in the database. These anomalies can have a detrimental effect on the overall performance and functionality of the IoT system.\n" + }, + "361": { + "start_time": "1697349599", + "end_time": "1697349670", + "start_timestamp": "2023-10-15 13:59:59", + "end_timestamp": "2023-10-15 14:01:10", + "alerts": [], + "labels": [ + "highly concurrent commits or highly concurrent inserts" + ], + "command": "python anomaly_trigger/main.py --anomaly INSERT_LARGE_DATA", + "script": "import psycopg2\nimport sys\nsys.path.append('/root/DB-GPT/')\nimport time\nimport datetime\nimport random\nimport yaml\nfrom multiprocessing.pool import *\n\nclass DBArgs(object):\n\n def __init__(self, dbtype, config, dbname=None):\n self.dbtype = dbtype\n if self.dbtype == 'mysql':\n self.host = config['host']\n self.port = config['port']\n self.user = config['user']\n self.password = config['password']\n self.dbname = dbname if dbname else config['dbname']\n self.driver = 'com.mysql.jdbc.Driver'\n self.jdbc = 'jdbc:mysql://'\n else:\n self.host = config['host']\n self.port = config['port']\n self.user = config['user']\n self.password = config['password']\n self.dbname = dbname if dbname else config['dbname']\n self.driver = 'org.postgresql.Driver'\n self.jdbc = 'jdbc:postgresql://'\n\nclass Database():\n def __init__(self, args, timeout=-1):\n self.args = args\n self.conn = self.resetConn(timeout)\n\n\n # self.schema = self.compute_table_schema()\n\n def resetConn(self, timeout=-1):\n conn = psycopg2.connect(database=self.args.dbname,\n user=self.args.user,\n password=self.args.password,\n host=self.args.host,\n port=self.args.port)\n return conn\n\n def execute_sqls(self,sql):\n self.conn =self.resetConn(timeout=-1)\n cur = self.conn.cursor()\n cur.execute(sql)\n self.conn.commit()\n cur.close()\n self.conn.close()\n\n def execute_sql_duration(self, duration, sql, max_id=0, commit_interval=500):\n self.conn = self.resetConn(timeout=-1)\n cursor = self.conn.cursor()\n start = time.time()\n cnt = 0\n if duration > 0:\n while (time.time() - start) < duration:\n if max_id > 0:\n id = random.randint(1, max_id - 1)\n cursor.execute(sql + str(id) + ';')\n else:\n cursor.execute(sql)\n cnt += 1\n if cnt % commit_interval == 0:\n self.conn.commit()\n else:\n print(\"error, the duration should be larger than 0\")\n self.conn.commit()\n cursor.close()\n self.conn.close()\n return cnt\n\n def concurrent_execute_sql(self, threads, duration, sql, max_id=0, commit_interval=500):\n pool = ThreadPool(threads)\n results = [pool.apply_async(self.execute_sql_duration, (duration, sql, max_id, commit_interval)) for _ in range(threads)]\n pool.close()\n pool.join()\n return results\n \ndef init():\n #add the config\n config_path = \"/root/DB-GPT/config/tool_config.yaml\"\n with open(config_path, 'r') as config_file:\n config = yaml.safe_load(config_file) \n db_args =DBArgs('pgsql', config)\n return db_args\n\n#create a table\ndef create_table(table_name,colsize, ncolumns):\n db=Database(init())\n column_definitions = ', '.join(f'name{i} varchar({colsize})' for i in range(ncolumns))\n creat_sql = f'CREATE TABLE 
{table_name} (id int, {column_definitions}, time timestamp);'\n db.execute_sqls(creat_sql)\n\n#delete the table\ndef delete_table(table_name):\n db=Database(init())\n delete_sql=f'DROP TABLE if exists {table_name}'\n db.execute_sqls(delete_sql)\n\n#print the current time\ndef print_time():\n current_time = datetime.datetime.now()\n formatted_time = current_time.strftime(\"%Y-%m-%d %H:%M:%S\")\n print(formatted_time)\n\n\ndef insert_large_data(threads,duration,ncolumns,nrows,colsize,table_name='table1'):\n\n print_time()\n #Delete undeleted tables\n delete_table(table_name)\n #create a new table\n create_table(table_name,colsize, ncolumns)\n db=Database(init())\n #insert the data\n #insert_definitions = ', '.join(f'repeat(round(random()*999)::text,{(colsize//3)})' for i in range(ncolumns))\n insert_definitions = ', '.join(f'(SELECT substr(md5(random()::text), 1, {colsize}))' for i in range(ncolumns))\n insert_data=f'insert into {table_name} select generate_series(1,{nrows}),{insert_definitions}, now();'\n db.concurrent_execute_sql(threads,duration,insert_data,commit_interval=1)\n\n #delete the table\n delete_table(table_name)\n \n #print the end time\n print_time()\nif __name__ == \"__main__\":\n # Number of threads to use for concurrent inserts\n num_threads = 76\n \n # Duration for which to run the inserts (in seconds)\n insert_duration = None\n \n # Number of columns in the table\n num_columns = 37\n \n # Number of rows to insert\n num_rows = 58\n \n # Size of each column (in characters)\n column_size = 52\n \n # Table name\n table_name = 'table1'\n \n # Call the insert_large_data function\n insert_large_data(num_threads, insert_duration, num_columns, num_rows, column_size, table_name)\n", + "description": "In an IoT system, 76 sensors generate a large amount of data that needs to be inserted into a database simultaneously. This data consists of 37 columns, each with a size of 52 characters, and a total of 58 rows. The goal is to simulate the database exception that can occur during this process.\n", + "desc": "In an Internet of Things (IoT) scenario, there is a database named 'IoTDataDB' that stores data collected from various IoT devices. This database is designed to handle a large amount of sensor data, which is continuously generated by these devices. The primary table in this database is called 'SensorReadings', which contains information about the sensor ID, sensor type, timestamp, and the reading value. There are 58 rows of data in this table, with each row representing a sensor reading. The table consists of 37 columns, each column containing information with a size of 52 characters. These columns may include sensor ID, sensor location, temperature, humidity, light intensity, motion detection, and other relevant sensor data. In this scenario, there are 76 IoT devices that are constantly generating sensor data and sending it to the 'SensorReadings' table in the 'IoTDataDB' database. However, due to the high volume of data being inserted into the database simultaneously, it might encounter performance issues. This could be because the database is not optimized for handling such large-scale data insertion operations. Without proper optimization, the database's abilities to process these write requests might be limited, resulting in increased write latency and potential anomalies within the database. 
These anomalies could impact the overall performance of the IoT data collection system and may lead to data inconsistencies or delays in processing the sensor readings.\n" + }, + "362": { + "start_time": "1697349730", + "end_time": "1697349790", + "start_timestamp": "2023-10-15 14:02:10", + "end_timestamp": "2023-10-15 14:03:10", + "alerts": [], + "labels": [ + "highly concurrent updates" + ], + "command": "python anomaly_trigger/main.py --anomaly LOCK_CONTENTION", + "script": "import psycopg2\nimport sys\nsys.path.append('/root/DB-GPT/')\nimport time\nimport datetime\nimport random\nimport yaml\nfrom multiprocessing.pool import *\n\n\nclass DBArgs(object):\n\n def __init__(self, dbtype, config, dbname=None):\n self.dbtype = dbtype\n if self.dbtype == 'mysql':\n self.host = config['host']\n self.port = config['port']\n self.user = config['user']\n self.password = config['password']\n self.dbname = dbname if dbname else config['dbname']\n self.driver = 'com.mysql.jdbc.Driver'\n self.jdbc = 'jdbc:mysql://'\n else:\n self.host = config['host']\n self.port = config['port']\n self.user = config['user']\n self.password = config['password']\n self.dbname = dbname if dbname else config['dbname']\n self.driver = 'org.postgresql.Driver'\n self.jdbc = 'jdbc:postgresql://'\n\nclass Database():\n def __init__(self, args, timeout=-1):\n self.args = args\n self.conn = self.resetConn(timeout)\n\n\n # self.schema = self.compute_table_schema()\n\n def resetConn(self, timeout=-1):\n conn = psycopg2.connect(database=self.args.dbname,\n user=self.args.user,\n password=self.args.password,\n host=self.args.host,\n port=self.args.port)\n return conn\n \n def execute_sqls(self,sql):\n self.conn =self.resetConn(timeout=-1)\n cur = self.conn.cursor()\n cur.execute(sql)\n self.conn.commit()\n cur.close()\n self.conn.close()\n\n def execute_sql_duration(self, duration, sql, max_id=0, commit_interval=500):\n self.conn = self.resetConn(timeout=-1)\n cursor = self.conn.cursor()\n start = time.time()\n cnt = 0\n if duration > 0:\n while (time.time() - start) < duration:\n if max_id > 0:\n id = random.randint(1, max_id - 1)\n cursor.execute(sql + str(id) + ';')\n else:\n cursor.execute(sql)\n cnt += 1\n if cnt % commit_interval == 0:\n self.conn.commit()\n else:\n print(\"error, the duration should be larger than 0\")\n self.conn.commit()\n cursor.close()\n self.conn.close()\n return cnt\n\n def concurrent_execute_sql(self, threads, duration, sql, max_id=0, commit_interval=500):\n pool = ThreadPool(threads)\n results = [pool.apply_async(self.execute_sql_duration, (duration, sql, max_id, commit_interval)) for _ in range(threads)]\n pool.close()\n pool.join()\n return results\n\ndef init():\n #add the config\n config_path = \"/root/DB-GPT/config/tool_config.yaml\"\n with open(config_path, 'r') as config_file:\n config = yaml.safe_load(config_file) \n db_args =DBArgs('pgsql', config)\n return db_args\n\n\n#create a table\ndef create_table(table_name,colsize, ncolumns):\n db=Database(init())\n column_definitions = ', '.join(f'name{i} varchar({colsize})' for i in range(ncolumns))\n creat_sql = f'CREATE TABLE {table_name} (id int, {column_definitions}, time timestamp);'\n db.execute_sqls(creat_sql)\n\n#delete the table\ndef delete_table(table_name):\n db=Database(init())\n delete_sql=f'DROP TABLE if exists {table_name}'\n db.execute_sqls(delete_sql)\n\n#print the current time\ndef print_time():\n current_time = datetime.datetime.now()\n formatted_time = current_time.strftime(\"%Y-%m-%d %H:%M:%S\")\n 
print(formatted_time)\n\n\ndef lock_contention(threads,duration,ncolumns,nrows,colsize,table_name='table1'):\n #create a new table\n print_time()\n delete_table(table_name)\n create_table(table_name,colsize, ncolumns)\n db=Database(init())\n # insert some data to be updated \n insert_definitions = ', '.join(f'(SELECT substr(md5(random()::text), 1, {colsize}))' for i in range(ncolumns))\n insert_data=f'insert into {table_name} select generate_series(1,{nrows}),{insert_definitions}, now();' \n db.execute_sqls(insert_data) \n pool = Pool(threads)\n for _ in range(threads):\n pool.apply_async(\n lock, (table_name, ncolumns, colsize, duration, nrows))\n pool.close()\n pool.join()\n #delete the table\n delete_table(table_name)\n print_time()\n\ndef lock(table_name, ncolumns, colsize, duration, nrows):\n args=init()\n start = time.time()\n #lock_contention\n while time.time()-start < duration:\n conn = psycopg2.connect(database=args.dbname, user=args.user, password=args.password,\n host=args.host, port=args.port)\n cur = conn.cursor()\n while time.time()-start < duration:\n col_name = random.randint(0, ncolumns-1)\n row_name = random.randint(1, nrows-1)\n lock_contention = f'update {table_name} set name{col_name}=(SELECT substr(md5(random()::text), 1, {colsize})) where id ={row_name}'\n #db.concurrent_execute_sql(threads,duration,lock_contention,nrows)\n cur.execute(lock_contention)\n conn.commit()\n conn.commit()\n conn.close()\n\nif __name__ == \"__main__\":\n # Number of threads to use for concurrent inserts\n num_threads = 107\n \n # Duration for which to run the inserts (in seconds)\n insert_duration = None\n \n # Number of columns in the table\n num_columns = 89\n \n # Number of rows to insert\n num_rows = 230\n \n # Size of each column (in characters)\n column_size = 54\n \n # Table name\n table_name = 'table1'\n \n # Call the insert_large_data function\n lock_contention(num_threads, insert_duration, num_columns, num_rows, column_size, table_name)\n", + "description": "In a database system for managing employee records, multiple users are trying to update the database simultaneously. This creates conflict and contention as the users compete to lock the database table to perform the update operation. The database table contains 89 columns and 230 rows of employee records, with each column having a size of 54 characters. There are a total of 107 users involved in this process. The aim is to simulate the database exception caused by the contention and locking.\n", + "desc": "In an Internet of Things (IoT) scenario, there exists a database called 'SensorDataDB' that is used for collecting and analyzing sensor data. This database is specifically designed to handle a large volume of data from various types of sensors. One of the key tables in this database is 'SensorReadings', which stores information about sensor readings. It consists of 230 rows of data, each representing a different sensor reading, with a total of 89 columns, each containing information of up to 54 characters. These columns may include sensor ID, reading type, reading value, timestamp, sensor location, and status information. In this particular scenario, 107 sensors are generating data simultaneously and there are 107 threads querying and updating the 'SensorReadings' table. Due to the high concurrency of these operations and potential locking on the table, there may be a contention issue leading to performance problems. 
This contention can result in delayed processing of queries and updates, affecting the overall efficiency of the IoT system. Additionally, if the database is not optimized to handle such high concurrency, it can further aggravate the lock contention issue.\n" + }, + "363": { + "start_time": "1697349851", + "end_time": "1697349885", + "start_timestamp": "2023-10-15 14:04:11", + "end_timestamp": "2023-10-15 14:04:45", + "alerts": [], + "labels": [ + "highly deletes" + ], + "command": "python anomaly_trigger/main.py --anomaly VACUUM", + "script": "import psycopg2\nimport sys\nsys.path.append('/root/DB-GPT/')\nimport time\nimport datetime\nimport random\nimport yaml\nfrom multiprocessing.pool import *\n\n\nclass DBArgs(object):\n\n def __init__(self, dbtype, config, dbname=None):\n self.dbtype = dbtype\n if self.dbtype == 'mysql':\n self.host = config['host']\n self.port = config['port']\n self.user = config['user']\n self.password = config['password']\n self.dbname = dbname if dbname else config['dbname']\n self.driver = 'com.mysql.jdbc.Driver'\n self.jdbc = 'jdbc:mysql://'\n else:\n self.host = config['host']\n self.port = config['port']\n self.user = config['user']\n self.password = config['password']\n self.dbname = dbname if dbname else config['dbname']\n self.driver = 'org.postgresql.Driver'\n self.jdbc = 'jdbc:postgresql://'\n\nclass Database():\n def __init__(self, args, timeout=-1):\n self.args = args\n self.conn = self.resetConn(timeout)\n\n\n # self.schema = self.compute_table_schema()\n\n def resetConn(self, timeout=-1):\n conn = psycopg2.connect(database=self.args.dbname,\n user=self.args.user,\n password=self.args.password,\n host=self.args.host,\n port=self.args.port)\n return conn\n \n def execute_sqls(self,sql):\n self.conn =self.resetConn(timeout=-1)\n cur = self.conn.cursor()\n cur.execute(sql)\n self.conn.commit()\n cur.close()\n self.conn.close()\n\n def execute_sql_duration(self, duration, sql, max_id=0, commit_interval=500):\n self.conn = self.resetConn(timeout=-1)\n cursor = self.conn.cursor()\n start = time.time()\n cnt = 0\n if duration > 0:\n while (time.time() - start) < duration:\n if max_id > 0:\n id = random.randint(1, max_id - 1)\n cursor.execute(sql + str(id) + ';')\n else:\n cursor.execute(sql)\n cnt += 1\n if cnt % commit_interval == 0:\n self.conn.commit()\n else:\n print(\"error, the duration should be larger than 0\")\n self.conn.commit()\n cursor.close()\n self.conn.close()\n return cnt\n\n def concurrent_execute_sql(self, threads, duration, sql, max_id=0, commit_interval=500):\n pool = ThreadPool(threads)\n results = [pool.apply_async(self.execute_sql_duration, (duration, sql, max_id, commit_interval)) for _ in range(threads)]\n pool.close()\n pool.join()\n return results\n \ndef init():\n #add the config\n config_path = \"/root/DB-GPT/config/tool_config.yaml\"\n with open(config_path, 'r') as config_file:\n config = yaml.safe_load(config_file) \n db_args =DBArgs('pgsql', config)\n return db_args\n\n\n#create a table\ndef create_table(table_name,colsize, ncolumns):\n db=Database(init())\n column_definitions = ', '.join(f'name{i} varchar({colsize})' for i in range(ncolumns))\n creat_sql = f'CREATE TABLE {table_name} (id int, {column_definitions}, time timestamp);'\n db.execute_sqls(creat_sql)\n\n#delete the table\ndef delete_table(table_name):\n db=Database(init())\n delete_sql=f'DROP TABLE if exists {table_name}'\n db.execute_sqls(delete_sql)\n\n#print the current time\ndef print_time():\n current_time = datetime.datetime.now()\n formatted_time = 
current_time.strftime(\"%Y-%m-%d %H:%M:%S\")\n print(formatted_time)\n\n\ndef vacuum(threads,duration,ncolumns,nrows,colsize,table_name='table1'):\n db=Database(init())\n #create a new table\n print_time()\n delete_table(table_name)\n create_table(table_name,colsize, ncolumns)\n\n # insert some data to be deleted\n insert_definitions = ', '.join(f'(SELECT substr(md5(random()::text), 1, {colsize}))' for i in range(ncolumns))\n insert_data=f'insert into {table_name} select generate_series(1,{nrows}),{insert_definitions}, now();' \n db.execute_sqls(insert_data) \n\n # delete 80% of the rows\n delete_nrows=int(nrows*0.8)\n vacuum=f'delete from {table_name} where id < {delete_nrows};'\n db.execute_sqls(vacuum)\n\n # do the select , then the vacuum occurs\n select='select * from '+table_name+' where id='\n db.concurrent_execute_sql(threads,duration,select,nrows)\n\n #delete the table\n delete_table(table_name)\n print_time()\n\n\nif __name__ == \"__main__\":\n # Number of threads to use for concurrent inserts\n num_threads = 153\n \n # Duration for which to run the inserts (in seconds)\n insert_duration = None\n \n # Number of columns in the table\n num_columns = 18\n \n # Number of rows to insert\n num_rows = 3976098\n \n # Size of each column (in characters)\n column_size = 52\n \n # Table name\n table_name = 'table1'\n \n # Call the insert_large_data function\n vacuum(num_threads, insert_duration, num_columns, num_rows, column_size, table_name)\n", + "description": "In an e-commerce database with 18 columns and 3,976,098 rows, each with a column size of 52 characters, the VACUUM command is used to clean up unused space and improve database performance. This command is executed by 153 users simultaneously, which may cause an exception or delay in the database.\n", + "desc": "In an e-commerce database, specifically in the database 'OnlineStoreDB', there is a key table named 'ProductDetails' that stores detailed information about various products. This table contains 3,976,098 rows of data, with each row representing a specific product entry. The table consists of 18 columns, each column capable of storing information up to 52 characters in length. These columns may include product ID, name, price, stock quantity, description, brand, category, size, color, weight, production date, expiration date, supplier information, country, user rating, number of reviews, promotional offers, and product availability status.In this specific situation, a vacuum operation needs to be performed in the database 'OnlineStoreDB'. The vacuum operation is a database maintenance operation used to reclaim storage space and improve database performance. This operation involves analyzing the database's internal data structure, identifying and removing obsolete or unused data, and reorganizing the remaining data to optimize storage.Performing a vacuum operation in an e-commerce database like 'OnlineStoreDB' is important for maintaining database performance and efficiency. The continuous insertion and deletion of product records can lead to database fragmentation and inefficient storage utilization. The vacuum operation helps address these issues by reorganizing the data, removing unnecessary space, and compacting the database.In this particular scenario, the vacuum operation is being performed with 153 threads, indicating the usage of parallelism to speed up the process. 
The large number of threads suggests that the database has a high volume of data and needs to be optimized to ensure efficient storage and query performance.\n" + }, + "364": { + "start_time": "1697349945", + "end_time": "1697350059", + "start_timestamp": "2023-10-15 14:05:45", + "end_timestamp": "2023-10-15 14:07:39", + "alerts": [], + "labels": [ + "too many indexes" + ], + "command": "python anomaly_trigger/main.py --anomaly REDUNDANT_INDEX", + "script": "import psycopg2\nimport sys\nsys.path.append('/root/DB-GPT/')\nimport time\nimport datetime\nimport random\nimport yaml\nfrom multiprocessing.pool import *\n\n\nclass DBArgs(object):\n\n def __init__(self, dbtype, config, dbname=None):\n self.dbtype = dbtype\n if self.dbtype == 'mysql':\n self.host = config['host']\n self.port = config['port']\n self.user = config['user']\n self.password = config['password']\n self.dbname = dbname if dbname else config['dbname']\n self.driver = 'com.mysql.jdbc.Driver'\n self.jdbc = 'jdbc:mysql://'\n else:\n self.host = config['host']\n self.port = config['port']\n self.user = config['user']\n self.password = config['password']\n self.dbname = dbname if dbname else config['dbname']\n self.driver = 'org.postgresql.Driver'\n self.jdbc = 'jdbc:postgresql://'\n\nclass Database():\n def __init__(self, args, timeout=-1):\n self.args = args\n self.conn = self.resetConn(timeout)\n\n\n # self.schema = self.compute_table_schema()\n\n def resetConn(self, timeout=-1):\n conn = psycopg2.connect(database=self.args.dbname,\n user=self.args.user,\n password=self.args.password,\n host=self.args.host,\n port=self.args.port)\n return conn\n \n def execute_sqls(self,sql):\n self.conn =self.resetConn(timeout=-1)\n cur = self.conn.cursor()\n cur.execute(sql)\n self.conn.commit()\n cur.close()\n self.conn.close()\n\n def execute_sql_duration(self, duration, sql, max_id=0, commit_interval=500):\n self.conn = self.resetConn(timeout=-1)\n cursor = self.conn.cursor()\n start = time.time()\n cnt = 0\n if duration > 0:\n while (time.time() - start) < duration:\n if max_id > 0:\n id = random.randint(1, max_id - 1)\n cursor.execute(sql + str(id) + ';')\n else:\n cursor.execute(sql)\n cnt += 1\n if cnt % commit_interval == 0:\n self.conn.commit()\n else:\n print(\"error, the duration should be larger than 0\")\n self.conn.commit()\n cursor.close()\n self.conn.close()\n return cnt\n\n def concurrent_execute_sql(self, threads, duration, sql, max_id=0, commit_interval=500):\n pool = ThreadPool(threads)\n results = [pool.apply_async(self.execute_sql_duration, (duration, sql, max_id, commit_interval)) for _ in range(threads)]\n pool.close()\n pool.join()\n return results\n \n def build_index(self, table_name, idx_num):\n self.conn = self.resetConn(timeout=-1)\n cursor = self.conn.cursor()\n \n for i in range(0, idx_num):\n the_sql = 'CREATE INDEX index_' + table_name + '_' + str(i) + ' ON ' + table_name + '(name' + str(i) + ');'\n print(the_sql)\n cursor.execute(the_sql)\n\n \n self.conn.commit()\n self.conn.close()\n return\n\n\n \n def drop_index(self,table_name):\n self.conn = self.resetConn(timeout=-1)\n cursor = self.conn.cursor()\n cursor.execute(\"select indexname from pg_indexes where tablename='\"+table_name+\"';\")\n idxs = cursor.fetchall()\n for idx in idxs:\n the_sql = 'DROP INDEX ' + idx[0] + ';'\n cursor.execute(the_sql)\n print(the_sql)\n self.conn.commit()\n self.conn.close()\n return\n\n\ndef init():\n #add the config\n config_path = \"/root/DB-GPT/config/tool_config.yaml\"\n with open(config_path, 'r') as config_file:\n 
config = yaml.safe_load(config_file) \n db_args =DBArgs('pgsql', config)\n return db_args\n\n\n#create a table\ndef create_table(table_name,colsize, ncolumns):\n db=Database(init())\n column_definitions = ', '.join(f'name{i} varchar({colsize})' for i in range(ncolumns))\n creat_sql = f'CREATE TABLE {table_name} (id int, {column_definitions}, time timestamp);'\n db.execute_sqls(creat_sql)\n\n#delete the table\ndef delete_table(table_name):\n db=Database(init())\n delete_sql=f'DROP TABLE if exists {table_name}'\n db.execute_sqls(delete_sql)\n\n#print the current time\ndef print_time():\n current_time = datetime.datetime.now()\n formatted_time = current_time.strftime(\"%Y-%m-%d %H:%M:%S\")\n print(formatted_time)\n\ndef redundent_index(threads,duration,ncolumns,nrows,colsize,nindex,table_name='table1'):\n #create a new table\n print_time()\n delete_table(table_name)\n create_table(table_name,colsize, ncolumns)\n db=Database(init())\n # insert some data to be updated \n insert_definitions = ', '.join(f'(SELECT substr(md5(random()::text), 1, {colsize}))' for i in range(ncolumns))\n insert_data=f'insert into {table_name} select generate_series(1,{nrows}),{insert_definitions}, now();' \n db.execute_sqls(insert_data) \n\n #initialization of the indexes\n nindex=int((nindex*ncolumns)/10)\n db.build_index(table_name,nindex)\n id_index='CREATE INDEX index_'+table_name+'_id ON '+table_name+'(id);'\n db.execute_sqls(id_index)\n\n #lock_contention\n pool = Pool(threads)\n for _ in range(threads):\n pool.apply_async(\n lock, (table_name, ncolumns, colsize, duration, nrows))\n pool.close()\n pool.join()\n\n #drop the index\n db.drop_index(table_name)\n\n #delete the table\n delete_table(table_name)\n print_time()\n\n\ndef lock(table_name, ncolumns, colsize, duration, nrows):\n args=init()\n start = time.time()\n #lock_contention\n while time.time()-start < duration:\n conn = psycopg2.connect(database=args.dbname, user=args.user, password=args.password,\n host=args.host, port=args.port)\n cur = conn.cursor()\n while time.time()-start < duration:\n col_name = random.randint(0, ncolumns-1)\n row_name = random.randint(1, nrows-1)\n lock_contention = f'update {table_name} set name{col_name}=(SELECT substr(md5(random()::text), 1, {colsize})) where id ={row_name}'\n #db.concurrent_execute_sql(threads,duration,lock_contention,nrows)\n cur.execute(lock_contention)\n conn.commit()\n conn.commit()\n conn.close()\n\nif __name__ == \"__main__\":\n # Number of threads to use for concurrent inserts\n num_threads = 5\n \n # Duration for which to run the inserts (in seconds)\n insert_duration = None\n \n # Number of columns in the table\n num_columns = 94\n \n # Number of rows to insert\n num_rows = 485731\n \n # Size of each column (in characters)\n column_size = 93\n \n # Table name\n table_name = 'table1'\n \n nindex=6\n \n # Call the insert_large_data function\n redundent_index(num_threads, insert_duration, num_columns, num_rows, column_size, nindex,table_name)\n", + "description": "In an online marketplace database, a situation is simulated where there are redundant indexes created for various product attributes such as name, category, and price range. This causes an additional storage footprint and performance overhead. 
The simulation involves 5 users searching in the database table, which contains 94 columns and 485,731 rows, with each column having a size of 93 characters.\n", + "desc": "In an IoT scenario, let's imagine a smart home automation system that utilizes a database called 'SmartHomeDB' to store and manage data related to various smart devices in a home. This database contains a key table named 'DeviceData', which records information about the different devices connected to the system. For example, this table might include data such as device ID, device type (e.g. thermostat, security camera, lights), current status (e.g. on/off), temperature, humidity, motion detection status, and other relevant information about the devices. In this scenario, there are 485,731 rows of data in the 'DeviceData' table, each representing a unique device entry. Each row contains a total of 94 columns, with each column allowing for data up to 93 characters in size. These columns store various data related to the smart devices and their functionalities. Suppose that, as part of the smart home automation system, there is a feature that allows users to query and analyze historical data from the 'DeviceData' table for specific devices or time periods. To improve the efficiency of these queries, the database administrator might create multiple indexes on the table, such as indexes based on device type, status, or specific data fields like temperature or motion detection status. However, if the administrator creates redundant indexes or duplicates indexes that serve the same purpose, it can lead to unnecessary overhead and inefficiency in query processing. When users execute queries that involve the 'DeviceData' table, the database may spend unnecessary time and resources to navigate and update these redundant indexes, resulting in slower query performance. In an IoT environment, where real-time data analysis and quick response are crucial, such redundant indexes can impact the overall system performance and user experience.\n" + }, + "365": { + "start_time": "1697350120", + "end_time": "1697350210", + "start_timestamp": "2023-10-15 14:08:40", + "end_timestamp": "2023-10-15 14:10:10", + "alerts": [], + "labels": [ + "INSERT_LARGE_DATA", + "IO_CONTENTION" + ], + "command": "python anomaly_trigger/main.py --anomaly INSERT_LARGE_DATA,IO_CONTENTION", + "script": "import os\nimport datetime\n\n#print the current time\ndef print_time():\n current_time = datetime.datetime.now()\n formatted_time = current_time.strftime(\"%Y-%m-%d %H:%M:%S\")\n print(formatted_time)\n\nif __name__ == \"__main__\":\n print_time()\n command = (\n \"su - root -c 'cd /sysbench-tpcc-master; \"\n \"./tpcc.lua --db-driver=pgsql --tables=2 --scale=3 --threads=50 --events=0 \"\n \"--pgsql-host=xxxx --pgsql-user=xxxx --pgsql-password=xxxx \"\n \"--pgsql-port=5432 --pgsql-db=tpcc --time=90 --rand-type=uniform --report-interval=10 run'\"\n )\n\n os.system(command)\n print_time()\n", + "description": "In a file sharing system where multiple users upload, download, or edit files simultaneously, the system experiences I/O contention. This leads to a slower file transfer process.\n", + "desc": "In the file transfer system scenario, there is a database called 'TeamFileShareDB' used by teams or organizations to share files. This database stores both the files themselves and metadata about the files, such as uploader information, file size, creation and modification dates, version history, access permissions, and download counts. 
Throughout a typical workday, multiple users engage in activities such as uploading, downloading, and editing files. For instance, a project team collaborates on an important report, with team members frequently uploading the latest versions of files while others download them for viewing or editing. The system also handles large file storage and sharing, such as presentations, video conference recordings, or design drawings. Due to the high number of concurrent file operations, the 'TeamFileShareDB' database faces challenges related to I/O (input/output) contention. When multiple users simultaneously upload or download large files, the storage and network bandwidth of the system are significantly strained. This contention in I/O can lead to slower file transfer speeds, especially in situations where bandwidth is limited or server processing capabilities are insufficient. Additionally, frequent write operations in the database, such as file uploads and updates to metadata, can impact database performance. During peak periods, the database may experience issues with locking and transaction management, further slowing down file processing and metadata recording.\n" + }, + "366": { + "start_time": "1697350270", + "end_time": "1697350331", + "start_timestamp": "2023-10-15 14:11:10", + "end_timestamp": "2023-10-15 14:12:11", + "alerts": [], + "labels": [ + "POOR JOIN PERFORMANCE", + "CPU CONTENTION" + ], + "command": "python anomaly_trigger/main.py --anomaly POOR_JOIN_PERFORMANCE,CPU_CONTENTION", + "script": "import os\nimport re\nimport time\n\nimport psycopg2\n\n\nREPEATCOUNT = 1\nTIMELOGPATH = str(int(time.time())) + \"_job_trigger_time_log.txt\"\nTIMELOG = open(TIMELOGPATH, 'w+')\n\n\nclass Database():\n\n def __init__(self):\n self.conn = None\n self.conn = psycopg2.connect(database='imdbload',\n user='xxxx',\n password='xxxx',\n host='xxxx',\n port=5432)\n\n def execute_sql(self, sql):\n fail = 1\n cur = self.conn.cursor()\n i = 0\n cnt = 3\n while fail == 1 and i < cnt:\n try:\n fail = 0\n cur.execute(sql)\n except BaseException as error:\n fail = 1\n print(error)\n res = []\n if fail == 0:\n res = cur.fetchall()\n i = i + 1\n if fail == 1:\n # print(\"SQL Execution Fatal!!\", sql)\n return 0, ''\n elif fail == 0:\n return 1, res\n\n\ndef all_sql_files():\n res_path = \"{}/join-order-benchmark-master/\".format(\n os.path.dirname(os.path.abspath(__file__)))\n # all_file_list = list(filter(file_filter, os.listdir(res_path)))\n # all_file_list = sorted(all_file_list, key=custom_sort)\n all_file_list = [\n '1a.sql', '1b.sql', '1c.sql', '1d.sql',\n '2a.sql', '2b.sql', '2c.sql', '2d.sql',\n '3a.sql', '3b.sql', '3c.sql',\n '4a.sql', '4b.sql', '4c.sql',\n '5a.sql', '5b.sql', '5c.sql',\n '6a.sql', '6b.sql', '6c.sql', '6d.sql', '6e.sql', '6f.sql',\n '7a.sql', '7b.sql', '7c.sql',\n '8a.sql', '8b.sql', '8c.sql', '8d.sql',\n '9a.sql', '9b.sql', '9c.sql', '9d.sql',\n '10a.sql', '10b.sql', '10c.sql',\n '11a.sql', '11b.sql', '11c.sql', '11d.sql',\n '12a.sql', '12b.sql', '12c.sql',\n '13a.sql', '13b.sql', '13c.sql', '13d.sql',\n '14a.sql', '14b.sql', '14c.sql',\n '15a.sql', '15b.sql', '15c.sql', '15d.sql',\n '16a.sql', '16b.sql', '16c.sql', '16d.sql',\n '17a.sql', '17b.sql', '17c.sql', '17d.sql', '17e.sql', '17f.sql',\n '18a.sql', '18b.sql', '18c.sql',\n '19a.sql', '19b.sql', '19c.sql', '19d.sql',\n '20a.sql', '20b.sql', '20c.sql',\n '21a.sql', '21b.sql', '21c.sql',\n '22a.sql', '22b.sql', '22c.sql', '22d.sql',\n '23a.sql', '23b.sql', '23c.sql',\n '24a.sql', '24b.sql',\n '25a.sql', '25b.sql', 
'25c.sql',\n '26a.sql', '26b.sql', '26c.sql',\n '27a.sql', '27b.sql', '27c.sql',\n '28a.sql', '28b.sql', '28c.sql',\n '29a.sql', '29b.sql', '29c.sql',\n '30a.sql', '30b.sql', '30c.sql',\n '31a.sql', '31b.sql', '31c.sql',\n '32a.sql', '32b.sql',\n '33a.sql', '33b.sql', '33c.sql']\n\n print(all_file_list)\n files_list = []\n for file in all_file_list:\n files_list.append(res_path + file)\n return files_list\n\n\ndef custom_sort(item):\n # \u63d0\u53d6\u6570\u5b57\u548c\u5b57\u6bcd\u90e8\u5206\n match = re.match(r'(\\d+)(\\D+)', item)\n # \u5c06\u6570\u5b57\u90e8\u5206\u8f6c\u6362\u4e3a\u6574\u6570\u4ee5\u8fdb\u884c\u6bd4\u8f83\n num_part = int(match.group(1))\n # \u8fd4\u56de\u5143\u7ec4\u4ee5\u6309\u6570\u5b57\u548c\u5b57\u6bcd\u6392\u5e8f\n return (num_part, match.group(2))\n\n\ndef file_filter(f):\n if f[-4:] == '.sql' and 'schema' not in f and 'fkindexes' not in f:\n return True\n else:\n return False\n\n\ndef get_sql_from_file(file_name):\n file = open(file_name)\n lines = file.readlines().copy()\n sql = ''\n for line in lines:\n sql += line\n sql = sql.replace('\n', ' ').replace(' ', ' ').replace(' ', ' ')\n file.close()\n return sql\n\n\ndef test_hint_from_file(sql_file):\n db = Database()\n sql = get_sql_from_file(sql_file)\n success, result_cont = db.execute_sql(sql)\n print(success, result_cont)\n\n\ndef test_all():\n sql_files = all_sql_files()\n\n for sql_file in list(sql_files)[:-10]:\n if sql_file:\n test_hint_from_file(sql_file)\n\n\ndef test_one():\n res_path = \"{}/join-order-benchmark-master/\".format(\n os.path.dirname(os.path.abspath(__file__)))\n test_hint_from_file(res_path + '1a.sql')\n\n\nif __name__ == '__main__':\n for i in range(0, REPEATCOUNT):\n TIMELOG.write(str(int(time.time()))+\";\")\n test_all()\n TIMELOG.write(str(int(time.time()))+\"\n\")\n TIMELOG.flush()\n\n TIMELOG.close()\n\n)\n", + "description": "In a company's database system, there are multiple users simultaneously performing join operations on large datasets using the CPU. This causes contention, leading to poor performance in the database.\n", + "desc": "In a business intelligence scenario, imagine a database named 'CorporateAnalyticsDB', used for storing and analyzing various business data of a large corporation. This database contains multiple complex tables such as 'SalesData', 'CustomerProfiles', 'ProductCatalog', and 'MarketTrends', each filled with a large number of rows and columns, recording detailed business information.In this scenario, the company's analysts frequently need to perform complex join queries across multiple tables to generate comprehensive business reports. For example, they might need to join the 'SalesData' table (containing sales records) with the 'CustomerProfiles' table (containing customer information) to analyze the purchasing behaviors of different customer groups. At the same time, they might also need to link these data with 'ProductCatalog' (the product catalog) and 'MarketTrends' (market trends) to gain deeper market insights.Given the large size of these tables and the involvement of multi-table joins, the performance of these queries can become very slow. If the join keys in these tables are not effectively indexed, or if the queries are not properly optimized, then executing these join operations can consume a significant amount of time and resources.During peak periods, when multiple complex join queries are executed simultaneously, this can lead to competition for CPU resources, further reducing query efficiency. 
The CPU contention might occur due to too many compute-intensive queries running on the database server, or the server's CPU resources being insufficient to handle these queries.\n" + }, + "367": { + "start_time": "1697350391", + "end_time": "1697350540", + "start_timestamp": "2023-10-15 14:13:11", + "end_timestamp": "2023-10-15 14:15:40", + "alerts": [], + "labels": [ + "FETCH_LARGE_DATA", + "CORRELATED SUBQUERY" + ], + "command": "python anomaly_trigger/main.py --anomaly FETCH_LARGE_DATA,CORRELATED_SUBQUERY", + "script": "import os\nimport re\nimport time\n\nimport psycopg2\n\n\nREPEATCOUNT = 1\nTIMELOGPATH = str(int(time.time())) + \"_tpch_trigger_time_log.txt\"\nTIMELOG = open(TIMELOGPATH, 'w+')\n\n\nclass Database():\n\n def __init__(self):\n self.conn = None\n self.conn = psycopg2.connect(database='tpch',\n user='xxxx',\n password='xxxx',\n host='xxxx',\n port=5432)\n\n def execute_sql(self, sql):\n fail = 1\n cur = self.conn.cursor()\n i = 0\n cnt = 3\n while fail == 1 and i < cnt:\n try:\n fail = 0\n cur.execute(sql)\n except BaseException as error:\n fail = 1\n print(error)\n res = []\n if fail == 0:\n res = cur.fetchall()\n i = i + 1\n if fail == 1:\n # print(\"SQL Execution Fatal!!\", sql)\n return 0, ''\n elif fail == 0:\n return 1, res\n\n\ndef all_sql_files():\n res_path = \"{}/tpch-queries/\".format(\n os.path.dirname(os.path.abspath(__file__)))\n # all_file_list = list(filter(file_filter, os.listdir(res_path)))\n # all_file_list = sorted(all_file_list, key=custom_sort)\n all_file_list = [\n '4.explain.sql']\n\n print(all_file_list)\n files_list = []\n for file in all_file_list:\n files_list.append(res_path + file)\n return files_list\n\n\ndef custom_sort(item):\n # \u63d0\u53d6\u6570\u5b57\u548c\u5b57\u6bcd\u90e8\u5206\n match = re.match(r'(\\d+)(\\D+)', item)\n # \u5c06\u6570\u5b57\u90e8\u5206\u8f6c\u6362\u4e3a\u6574\u6570\u4ee5\u8fdb\u884c\u6bd4\u8f83\n num_part = int(match.group(1))\n # \u8fd4\u56de\u5143\u7ec4\u4ee5\u6309\u6570\u5b57\u548c\u5b57\u6bcd\u6392\u5e8f\n return (num_part, match.group(2))\n\n\ndef file_filter(f):\n if f[-4:] == '.sql' and 'schema' not in f and 'fkindexes' not in f:\n return True\n else:\n return False\n\n\ndef get_sql_from_file(file_name):\n file = open(file_name)\n lines = file.readlines().copy()\n sql = ''\n for line in lines:\n sql += line\n sql = sql.replace('\n', ' ').replace(' ', ' ').replace(' ', ' ')\n file.close()\n return sql\n\n\ndef test_hint_from_file(sql_file):\n db = Database()\n sql = get_sql_from_file(sql_file)\n success, result_cont = db.execute_sql(sql)\n print(success, result_cont)\n\n\ndef test_all():\n sql_files = all_sql_files()\n\n for sql_file in list(sql_files):\n if sql_file:\n test_hint_from_file(sql_file)\n\n\ndef test_one():\n res_path = \"{}/tpch-queries/\".format(\n os.path.dirname(os.path.abspath(__file__)))\n test_hint_from_file(res_path + '1.explain.sql')\n\n\nif __name__ == '__main__':\n for i in range(0, REPEATCOUNT):\n TIMELOG.write(str(int(time.time()))+\";\")\n test_all()\n TIMELOG.write(str(int(time.time()))+\"\n\")\n TIMELOG.flush()\n\n TIMELOG.close()\n", + "description": "In an online retail database, when fetching a large amount of data, the execution of related subqueries may be needed to retrieve inventory information for each product. 
If these subqueries are not optimized, it can lead to a deterioration in the performance of retrieving inventory data.\n", + "desc": "In this IoT scenario, let's consider a database called 'SmartHomeDB' that is used to store and analyze sensor data collected from various smart devices in a home. The database contains a table called 'SensorReadings' which records data from different sensors. Each row in the table represents a reading from a specific sensor and contains information such as sensor ID, sensor type (temperature, humidity, motion, etc.), reading value, timestamp, and location.In this particular scenario, the database encounters performance issues when trying to fetch large amounts of data and perform correlated subqueries. For example, when trying to retrieve the temperature readings from all sensors in the living room during a specific time period, the database might need to perform subqueries to gather the relevant information. However, due to the large volume of data and the need to correlate multiple subqueries, the database becomes inefficient in executing these queries. This can lead to slow query execution times and potential bottlenecks in terms of I/O.Overall, the FETCH_LARGE_DATA and CORRELATED_SUBQUERY anomalies in this IoT scenario highlight the challenges faced by the database when processing and retrieving large amounts of sensor data, especially when complex correlated subqueries are involved.\n" + }, + "368": { + "start_time": "1697350600", + "end_time": "1697350672", + "start_timestamp": "2023-10-15 14:16:40", + "end_timestamp": "2023-10-15 14:17:52", + "alerts": [], + "labels": [ + "highly concurrent commits or highly concurrent inserts" + ], + "command": "python anomaly_trigger/main.py --anomaly INSERT_LARGE_DATA", + "script": "import psycopg2\nimport sys\nsys.path.append('/root/DB-GPT/')\nimport time\nimport datetime\nimport random\nimport yaml\nfrom multiprocessing.pool import *\n\nclass DBArgs(object):\n\n def __init__(self, dbtype, config, dbname=None):\n self.dbtype = dbtype\n if self.dbtype == 'mysql':\n self.host = config['host']\n self.port = config['port']\n self.user = config['user']\n self.password = config['password']\n self.dbname = dbname if dbname else config['dbname']\n self.driver = 'com.mysql.jdbc.Driver'\n self.jdbc = 'jdbc:mysql://'\n else:\n self.host = config['host']\n self.port = config['port']\n self.user = config['user']\n self.password = config['password']\n self.dbname = dbname if dbname else config['dbname']\n self.driver = 'org.postgresql.Driver'\n self.jdbc = 'jdbc:postgresql://'\n\nclass Database():\n def __init__(self, args, timeout=-1):\n self.args = args\n self.conn = self.resetConn(timeout)\n\n\n # self.schema = self.compute_table_schema()\n\n def resetConn(self, timeout=-1):\n conn = psycopg2.connect(database=self.args.dbname,\n user=self.args.user,\n password=self.args.password,\n host=self.args.host,\n port=self.args.port)\n return conn\n\n def execute_sqls(self,sql):\n self.conn =self.resetConn(timeout=-1)\n cur = self.conn.cursor()\n cur.execute(sql)\n self.conn.commit()\n cur.close()\n self.conn.close()\n\n def execute_sql_duration(self, duration, sql, max_id=0, commit_interval=500):\n self.conn = self.resetConn(timeout=-1)\n cursor = self.conn.cursor()\n start = time.time()\n cnt = 0\n if duration > 0:\n while (time.time() - start) < duration:\n if max_id > 0:\n id = random.randint(1, max_id - 1)\n cursor.execute(sql + str(id) + ';')\n else:\n cursor.execute(sql)\n cnt += 1\n if cnt % commit_interval == 0:\n 
self.conn.commit()\n else:\n print(\"error, the duration should be larger than 0\")\n self.conn.commit()\n cursor.close()\n self.conn.close()\n return cnt\n\n def concurrent_execute_sql(self, threads, duration, sql, max_id=0, commit_interval=500):\n pool = ThreadPool(threads)\n results = [pool.apply_async(self.execute_sql_duration, (duration, sql, max_id, commit_interval)) for _ in range(threads)]\n pool.close()\n pool.join()\n return results\n \ndef init():\n #add the config\n config_path = \"/root/DB-GPT/config/tool_config.yaml\"\n with open(config_path, 'r') as config_file:\n config = yaml.safe_load(config_file) \n db_args =DBArgs('pgsql', config)\n return db_args\n\n#create a table\ndef create_table(table_name,colsize, ncolumns):\n db=Database(init())\n column_definitions = ', '.join(f'name{i} varchar({colsize})' for i in range(ncolumns))\n creat_sql = f'CREATE TABLE {table_name} (id int, {column_definitions}, time timestamp);'\n db.execute_sqls(creat_sql)\n\n#delete the table\ndef delete_table(table_name):\n db=Database(init())\n delete_sql=f'DROP TABLE if exists {table_name}'\n db.execute_sqls(delete_sql)\n\n#print the current time\ndef print_time():\n current_time = datetime.datetime.now()\n formatted_time = current_time.strftime(\"%Y-%m-%d %H:%M:%S\")\n print(formatted_time)\n\n\ndef insert_large_data(threads,duration,ncolumns,nrows,colsize,table_name='table1'):\n\n print_time()\n #Delete undeleted tables\n delete_table(table_name)\n #create a new table\n create_table(table_name,colsize, ncolumns)\n db=Database(init())\n #insert the data\n #insert_definitions = ', '.join(f'repeat(round(random()*999)::text,{(colsize//3)})' for i in range(ncolumns))\n insert_definitions = ', '.join(f'(SELECT substr(md5(random()::text), 1, {colsize}))' for i in range(ncolumns))\n insert_data=f'insert into {table_name} select generate_series(1,{nrows}),{insert_definitions}, now();'\n db.concurrent_execute_sql(threads,duration,insert_data,commit_interval=1)\n\n #delete the table\n delete_table(table_name)\n \n #print the end time\n print_time()\nif __name__ == \"__main__\":\n # Number of threads to use for concurrent inserts\n num_threads = 173\n \n # Duration for which to run the inserts (in seconds)\n insert_duration = None\n \n # Number of columns in the table\n num_columns = 17\n \n # Number of rows to insert\n num_rows = 68\n \n # Size of each column (in characters)\n column_size = 66\n \n # Table name\n table_name = 'table1'\n \n # Call the insert_large_data function\n insert_large_data(num_threads, insert_duration, num_columns, num_rows, column_size, table_name)\n", + "description": "In a data-intensive application, 173 sensors generate a large amount of data that needs to be inserted into a database. This process will simulate the database exception caused by inserting data from 173 sensors, where each sensor generates data with 17 columns, each column having a size of 66 characters, and a total of 68 rows of data.\n", + "desc": "In an Internet of Things (IoT) scenario, imagine a database called 'IoTDataDB', specifically designed for storing and processing sensor data from various IoT devices. This database contains a key table named 'SensorReadings', which records detailed information about different sensor readings. Each row in this table represents a specific reading from a sensor, and there are a total of 68 rows. The table consists of 17 columns, each containing information of up to 66 characters. 
These columns may include sensor ID, sensor type (such as temperature, humidity, pressure), reading value, timestamp, device location, and status information.In this scenario, 173 IoT devices are concurrently sending sensor data to the database at a high frequency. Each device generates a large number of readings, resulting in a significant volume of data being inserted into the 'SensorReadings' table. However, due to the lack of proper data insertion optimization techniques or inefficient indexing, the database's ability to handle such a high number of concurrent insert operations might be limited. This can lead to performance issues in the database, such as increased write latency or even database locking. These anomalies can impact the real-time processing and analysis of the sensor data, potentially affecting the overall performance and functionality of the IoT system.\n" + }, + "369": { + "start_time": "1697350733", + "end_time": "1697350805", + "start_timestamp": "2023-10-15 14:18:53", + "end_timestamp": "2023-10-15 14:20:05", + "alerts": [], + "labels": [ + "highly concurrent commits or highly concurrent inserts" + ], + "command": "python anomaly_trigger/main.py --anomaly INSERT_LARGE_DATA", + "script": "import psycopg2\nimport sys\nsys.path.append('/root/DB-GPT/')\nimport time\nimport datetime\nimport random\nimport yaml\nfrom multiprocessing.pool import *\n\nclass DBArgs(object):\n\n def __init__(self, dbtype, config, dbname=None):\n self.dbtype = dbtype\n if self.dbtype == 'mysql':\n self.host = config['host']\n self.port = config['port']\n self.user = config['user']\n self.password = config['password']\n self.dbname = dbname if dbname else config['dbname']\n self.driver = 'com.mysql.jdbc.Driver'\n self.jdbc = 'jdbc:mysql://'\n else:\n self.host = config['host']\n self.port = config['port']\n self.user = config['user']\n self.password = config['password']\n self.dbname = dbname if dbname else config['dbname']\n self.driver = 'org.postgresql.Driver'\n self.jdbc = 'jdbc:postgresql://'\n\nclass Database():\n def __init__(self, args, timeout=-1):\n self.args = args\n self.conn = self.resetConn(timeout)\n\n\n # self.schema = self.compute_table_schema()\n\n def resetConn(self, timeout=-1):\n conn = psycopg2.connect(database=self.args.dbname,\n user=self.args.user,\n password=self.args.password,\n host=self.args.host,\n port=self.args.port)\n return conn\n\n def execute_sqls(self,sql):\n self.conn =self.resetConn(timeout=-1)\n cur = self.conn.cursor()\n cur.execute(sql)\n self.conn.commit()\n cur.close()\n self.conn.close()\n\n def execute_sql_duration(self, duration, sql, max_id=0, commit_interval=500):\n self.conn = self.resetConn(timeout=-1)\n cursor = self.conn.cursor()\n start = time.time()\n cnt = 0\n if duration > 0:\n while (time.time() - start) < duration:\n if max_id > 0:\n id = random.randint(1, max_id - 1)\n cursor.execute(sql + str(id) + ';')\n else:\n cursor.execute(sql)\n cnt += 1\n if cnt % commit_interval == 0:\n self.conn.commit()\n else:\n print(\"error, the duration should be larger than 0\")\n self.conn.commit()\n cursor.close()\n self.conn.close()\n return cnt\n\n def concurrent_execute_sql(self, threads, duration, sql, max_id=0, commit_interval=500):\n pool = ThreadPool(threads)\n results = [pool.apply_async(self.execute_sql_duration, (duration, sql, max_id, commit_interval)) for _ in range(threads)]\n pool.close()\n pool.join()\n return results\n \ndef init():\n #add the config\n config_path = \"/root/DB-GPT/config/tool_config.yaml\"\n with open(config_path, 'r') 
as config_file:\n config = yaml.safe_load(config_file) \n db_args =DBArgs('pgsql', config)\n return db_args\n\n#create a table\ndef create_table(table_name,colsize, ncolumns):\n db=Database(init())\n column_definitions = ', '.join(f'name{i} varchar({colsize})' for i in range(ncolumns))\n creat_sql = f'CREATE TABLE {table_name} (id int, {column_definitions}, time timestamp);'\n db.execute_sqls(creat_sql)\n\n#delete the table\ndef delete_table(table_name):\n db=Database(init())\n delete_sql=f'DROP TABLE if exists {table_name}'\n db.execute_sqls(delete_sql)\n\n#print the current time\ndef print_time():\n current_time = datetime.datetime.now()\n formatted_time = current_time.strftime(\"%Y-%m-%d %H:%M:%S\")\n print(formatted_time)\n\n\ndef insert_large_data(threads,duration,ncolumns,nrows,colsize,table_name='table1'):\n\n print_time()\n #Delete undeleted tables\n delete_table(table_name)\n #create a new table\n create_table(table_name,colsize, ncolumns)\n db=Database(init())\n #insert the data\n #insert_definitions = ', '.join(f'repeat(round(random()*999)::text,{(colsize//3)})' for i in range(ncolumns))\n insert_definitions = ', '.join(f'(SELECT substr(md5(random()::text), 1, {colsize}))' for i in range(ncolumns))\n insert_data=f'insert into {table_name} select generate_series(1,{nrows}),{insert_definitions}, now();'\n db.concurrent_execute_sql(threads,duration,insert_data,commit_interval=1)\n\n #delete the table\n delete_table(table_name)\n \n #print the end time\n print_time()\nif __name__ == \"__main__\":\n # Number of threads to use for concurrent inserts\n num_threads = 173\n \n # Duration for which to run the inserts (in seconds)\n insert_duration = None\n \n # Number of columns in the table\n num_columns = 31\n \n # Number of rows to insert\n num_rows = 70\n \n # Size of each column (in characters)\n column_size = 77\n \n # Table name\n table_name = 'table1'\n \n # Call the insert_large_data function\n insert_large_data(num_threads, insert_duration, num_columns, num_rows, column_size, table_name)\n", + "description": "In a system where multiple users are simultaneously inserting large amounts of data into a database, the script is simulating the scenario where 173 users are inserting data. Each user is inserting data into a table with 31 columns, where each column can hold up to 77 characters. The table has a total of 70 rows. This script aims to trigger a database exception caused by the high volume of simultaneous data insertions.\n", + "desc": "In the business intelligence scenario, there is a database 'BIAnalyticsDB' used for storing and analyzing business data of a company. This database contains multiple tables including 'SalesData', 'CustomerProfiles', 'ProductCatalog', and 'MarketTrends'. The 'SalesData' table records sales records, the 'CustomerProfiles' table contains customer information, the 'ProductCatalog' table stores product details, and the 'MarketTrends' table tracks market trends. When executing complex insert operations in this scenario, such as inserting a large amount of data into the 'SalesData' table, it is important to consider the impact on the database's performance. In this example, 173 concurrent insert operations are performed, where each insert operation involves inserting a row of data into the 'SalesData' table. Each row contains 31 columns with a size of 77 characters. These columns may include information such as transaction ID, customer ID, product ID, quantity, price, and date. 
This high volume of concurrent inserts can put significant pressure on the database's write capacity and may cause performance issues such as increased write latency or database locking. It is important to properly optimize the database and consider measures such as batching the inserts or optimizing disk write operations to ensure efficient data insertion without triggering anomalies.\n" + }, + "370": { + "start_time": "1697350865", + "end_time": "1697350925", + "start_timestamp": "2023-10-15 14:21:05", + "end_timestamp": "2023-10-15 14:22:05", + "alerts": [], + "labels": [ + "highly concurrent updates" + ], + "command": "python anomaly_trigger/main.py --anomaly LOCK_CONTENTION", + "script": "import psycopg2\nimport sys\nsys.path.append('/root/DB-GPT/')\nimport time\nimport datetime\nimport random\nimport yaml\nfrom multiprocessing.pool import *\n\n\nclass DBArgs(object):\n\n def __init__(self, dbtype, config, dbname=None):\n self.dbtype = dbtype\n if self.dbtype == 'mysql':\n self.host = config['host']\n self.port = config['port']\n self.user = config['user']\n self.password = config['password']\n self.dbname = dbname if dbname else config['dbname']\n self.driver = 'com.mysql.jdbc.Driver'\n self.jdbc = 'jdbc:mysql://'\n else:\n self.host = config['host']\n self.port = config['port']\n self.user = config['user']\n self.password = config['password']\n self.dbname = dbname if dbname else config['dbname']\n self.driver = 'org.postgresql.Driver'\n self.jdbc = 'jdbc:postgresql://'\n\nclass Database():\n def __init__(self, args, timeout=-1):\n self.args = args\n self.conn = self.resetConn(timeout)\n\n\n # self.schema = self.compute_table_schema()\n\n def resetConn(self, timeout=-1):\n conn = psycopg2.connect(database=self.args.dbname,\n user=self.args.user,\n password=self.args.password,\n host=self.args.host,\n port=self.args.port)\n return conn\n \n def execute_sqls(self,sql):\n self.conn =self.resetConn(timeout=-1)\n cur = self.conn.cursor()\n cur.execute(sql)\n self.conn.commit()\n cur.close()\n self.conn.close()\n\n def execute_sql_duration(self, duration, sql, max_id=0, commit_interval=500):\n self.conn = self.resetConn(timeout=-1)\n cursor = self.conn.cursor()\n start = time.time()\n cnt = 0\n if duration > 0:\n while (time.time() - start) < duration:\n if max_id > 0:\n id = random.randint(1, max_id - 1)\n cursor.execute(sql + str(id) + ';')\n else:\n cursor.execute(sql)\n cnt += 1\n if cnt % commit_interval == 0:\n self.conn.commit()\n else:\n print(\"error, the duration should be larger than 0\")\n self.conn.commit()\n cursor.close()\n self.conn.close()\n return cnt\n\n def concurrent_execute_sql(self, threads, duration, sql, max_id=0, commit_interval=500):\n pool = ThreadPool(threads)\n results = [pool.apply_async(self.execute_sql_duration, (duration, sql, max_id, commit_interval)) for _ in range(threads)]\n pool.close()\n pool.join()\n return results\n\ndef init():\n #add the config\n config_path = \"/root/DB-GPT/config/tool_config.yaml\"\n with open(config_path, 'r') as config_file:\n config = yaml.safe_load(config_file) \n db_args =DBArgs('pgsql', config)\n return db_args\n\n\n#create a table\ndef create_table(table_name,colsize, ncolumns):\n db=Database(init())\n column_definitions = ', '.join(f'name{i} varchar({colsize})' for i in range(ncolumns))\n creat_sql = f'CREATE TABLE {table_name} (id int, {column_definitions}, time timestamp);'\n db.execute_sqls(creat_sql)\n\n#delete the table\ndef delete_table(table_name):\n db=Database(init())\n delete_sql=f'DROP TABLE if exists 
{table_name}'\n db.execute_sqls(delete_sql)\n\n#print the current time\ndef print_time():\n current_time = datetime.datetime.now()\n formatted_time = current_time.strftime(\"%Y-%m-%d %H:%M:%S\")\n print(formatted_time)\n\n\ndef lock_contention(threads,duration,ncolumns,nrows,colsize,table_name='table1'):\n #create a new table\n print_time()\n delete_table(table_name)\n create_table(table_name,colsize, ncolumns)\n db=Database(init())\n # insert some data to be updated \n insert_definitions = ', '.join(f'(SELECT substr(md5(random()::text), 1, {colsize}))' for i in range(ncolumns))\n insert_data=f'insert into {table_name} select generate_series(1,{nrows}),{insert_definitions}, now();' \n db.execute_sqls(insert_data) \n pool = Pool(threads)\n for _ in range(threads):\n pool.apply_async(\n lock, (table_name, ncolumns, colsize, duration, nrows))\n pool.close()\n pool.join()\n #delete the table\n delete_table(table_name)\n print_time()\n\ndef lock(table_name, ncolumns, colsize, duration, nrows):\n args=init()\n start = time.time()\n #lock_contention\n while time.time()-start < duration:\n conn = psycopg2.connect(database=args.dbname, user=args.user, password=args.password,\n host=args.host, port=args.port)\n cur = conn.cursor()\n while time.time()-start < duration:\n col_name = random.randint(0, ncolumns-1)\n row_name = random.randint(1, nrows-1)\n lock_contention = f'update {table_name} set name{col_name}=(SELECT substr(md5(random()::text), 1, {colsize})) where id ={row_name}'\n #db.concurrent_execute_sql(threads,duration,lock_contention,nrows)\n cur.execute(lock_contention)\n conn.commit()\n conn.commit()\n conn.close()\n\nif __name__ == \"__main__\":\n # Number of threads to use for concurrent inserts\n num_threads = 181\n \n # Duration for which to run the inserts (in seconds)\n insert_duration = None\n \n # Number of columns in the table\n num_columns = 82\n \n # Number of rows to insert\n num_rows = 375\n \n # Size of each column (in characters)\n column_size = 50\n \n # Table name\n table_name = 'table1'\n \n # Call the insert_large_data function\n lock_contention(num_threads, insert_duration, num_columns, num_rows, column_size, table_name)\n", + "description": "In a database system for an online store, there are 181 users simultaneously competing for locks to perform frequent update operations on a database table containing 82 columns and 375 rows of product records. Each column has a size of 50 characters. This simulates a scenario where multiple users are trying to update the same database simultaneously, leading to lock contention and potential database exceptions.\n", + "desc": "In a bank scenario, let's consider a database named 'BankingDB' which manages customer and transaction data. Within this database, there is a key table called 'AccountTransactions' that stores detailed information about different banking transactions. This table consists of 375 rows, with each row representing a transaction record for an account. It has a total of 82 columns, each of which can contain up to 50 characters. 
These columns include transaction ID, account number, transaction type (e.g., deposit, withdrawal, transfer), transaction amount, transaction date and time, counterparty account information, transaction status, employee ID (of the bank staff who processed the transaction), transaction location, currency type, and moreDuring busy banking hours, multiple users (such as bank staff, automated systems, or customers through an online banking platform) may simultaneously attempt frequent update operations on the 'AccountTransactions' table. These operations could involve updating transaction statuses, modifying transaction amounts, or adding transaction notes. In a specific instance, 181 users attempt to perform these update operations simultaneously, and this high concurrency can lead to contention for locking the database table.Due to the table's design and the database's locking mechanism, competition for locks can occur among the users, resulting in prolonged locking of the table. This locking contention can negatively impact database performance, causing delayed processing or failure of other users' transaction requests. This, in turn, affects the smooth functioning of daily banking operations. Additionally, if such incidents occur frequently, they can lead to rapid growth in the database transaction log, consuming excessive storage space, and even causing temporary interruptions in database services.\n" + }, + "371": { + "start_time": "1697350985", + "end_time": "1697351024", + "start_timestamp": "2023-10-15 14:23:05", + "end_timestamp": "2023-10-15 14:23:44", + "alerts": [], + "labels": [ + "highly deletes" + ], + "command": "python anomaly_trigger/main.py --anomaly VACUUM", + "script": "import psycopg2\nimport sys\nsys.path.append('/root/DB-GPT/')\nimport time\nimport datetime\nimport random\nimport yaml\nfrom multiprocessing.pool import *\n\n\nclass DBArgs(object):\n\n def __init__(self, dbtype, config, dbname=None):\n self.dbtype = dbtype\n if self.dbtype == 'mysql':\n self.host = config['host']\n self.port = config['port']\n self.user = config['user']\n self.password = config['password']\n self.dbname = dbname if dbname else config['dbname']\n self.driver = 'com.mysql.jdbc.Driver'\n self.jdbc = 'jdbc:mysql://'\n else:\n self.host = config['host']\n self.port = config['port']\n self.user = config['user']\n self.password = config['password']\n self.dbname = dbname if dbname else config['dbname']\n self.driver = 'org.postgresql.Driver'\n self.jdbc = 'jdbc:postgresql://'\n\nclass Database():\n def __init__(self, args, timeout=-1):\n self.args = args\n self.conn = self.resetConn(timeout)\n\n\n # self.schema = self.compute_table_schema()\n\n def resetConn(self, timeout=-1):\n conn = psycopg2.connect(database=self.args.dbname,\n user=self.args.user,\n password=self.args.password,\n host=self.args.host,\n port=self.args.port)\n return conn\n \n def execute_sqls(self,sql):\n self.conn =self.resetConn(timeout=-1)\n cur = self.conn.cursor()\n cur.execute(sql)\n self.conn.commit()\n cur.close()\n self.conn.close()\n\n def execute_sql_duration(self, duration, sql, max_id=0, commit_interval=500):\n self.conn = self.resetConn(timeout=-1)\n cursor = self.conn.cursor()\n start = time.time()\n cnt = 0\n if duration > 0:\n while (time.time() - start) < duration:\n if max_id > 0:\n id = random.randint(1, max_id - 1)\n cursor.execute(sql + str(id) + ';')\n else:\n cursor.execute(sql)\n cnt += 1\n if cnt % commit_interval == 0:\n self.conn.commit()\n else:\n print(\"error, the duration should be larger than 
0\")\n self.conn.commit()\n cursor.close()\n self.conn.close()\n return cnt\n\n def concurrent_execute_sql(self, threads, duration, sql, max_id=0, commit_interval=500):\n pool = ThreadPool(threads)\n results = [pool.apply_async(self.execute_sql_duration, (duration, sql, max_id, commit_interval)) for _ in range(threads)]\n pool.close()\n pool.join()\n return results\n \ndef init():\n #add the config\n config_path = \"/root/DB-GPT/config/tool_config.yaml\"\n with open(config_path, 'r') as config_file:\n config = yaml.safe_load(config_file) \n db_args =DBArgs('pgsql', config)\n return db_args\n\n\n#create a table\ndef create_table(table_name,colsize, ncolumns):\n db=Database(init())\n column_definitions = ', '.join(f'name{i} varchar({colsize})' for i in range(ncolumns))\n creat_sql = f'CREATE TABLE {table_name} (id int, {column_definitions}, time timestamp);'\n db.execute_sqls(creat_sql)\n\n#delete the table\ndef delete_table(table_name):\n db=Database(init())\n delete_sql=f'DROP TABLE if exists {table_name}'\n db.execute_sqls(delete_sql)\n\n#print the current time\ndef print_time():\n current_time = datetime.datetime.now()\n formatted_time = current_time.strftime(\"%Y-%m-%d %H:%M:%S\")\n print(formatted_time)\n\n\ndef vacuum(threads,duration,ncolumns,nrows,colsize,table_name='table1'):\n db=Database(init())\n #create a new table\n print_time()\n delete_table(table_name)\n create_table(table_name,colsize, ncolumns)\n\n # insert some data to be deleted\n insert_definitions = ', '.join(f'(SELECT substr(md5(random()::text), 1, {colsize}))' for i in range(ncolumns))\n insert_data=f'insert into {table_name} select generate_series(1,{nrows}),{insert_definitions}, now();' \n db.execute_sqls(insert_data) \n\n # delete 80% of the rows\n delete_nrows=int(nrows*0.8)\n vacuum=f'delete from {table_name} where id < {delete_nrows};'\n db.execute_sqls(vacuum)\n\n # do the select , then the vacuum occurs\n select='select * from '+table_name+' where id='\n db.concurrent_execute_sql(threads,duration,select,nrows)\n\n #delete the table\n delete_table(table_name)\n print_time()\n\n\nif __name__ == \"__main__\":\n # Number of threads to use for concurrent inserts\n num_threads = 98\n \n # Duration for which to run the inserts (in seconds)\n insert_duration = None\n \n # Number of columns in the table\n num_columns = 17\n \n # Number of rows to insert\n num_rows = 3560139\n \n # Size of each column (in characters)\n column_size = 77\n \n # Table name\n table_name = 'table1'\n \n # Call the insert_large_data function\n vacuum(num_threads, insert_duration, num_columns, num_rows, column_size, table_name)\n", + "description": "In a database for an online marketplace, if 98 users simultaneously search for products using terms like product name, category, and price range, after performing a large-scale data cleaning operation on a table containing 17 columns and 3,560,139 rows, each column consisting of 77 characters of product records, it may lead to a database exception.\n", + "desc": "In the database of an e-commerce platform, suppose there is a database named 'ECommerceDB'. This database stores information related to various products and their details. In one of the tables called 'ProductRecords', there are 3,560,139 rows of data, each representing a specific product entry. This table consists of 17 columns, each containing information of up to 77 characters. 
These columns could include product ID, name, price, stock quantity, description, brand, category, size, color, weight, production date, expiration date, supplier information, country, rating, number of reviews, and sales status. Under certain circumstances, such as updating the product inventory or removing outdated products, the database administrator needs to perform a process called 'VACUUM' to clean up the unnecessary data. If this process is not optimized or scheduled properly, it can impact the performance of the database. This can result in slower response times for user queries or other operations, potentially adversely affecting the e-commerce platform's functionality and the user experience.\n" + }, + "372": { + "start_time": "1697351084", + "end_time": "1697351199", + "start_timestamp": "2023-10-15 14:24:44", + "end_timestamp": "2023-10-15 14:26:39", + "alerts": [], + "labels": [ + "too many indexes" + ], + "command": "python anomaly_trigger/main.py --anomaly REDUNDANT_INDEX", + "script": "import psycopg2\nimport sys\nsys.path.append('/root/DB-GPT/')\nimport time\nimport datetime\nimport random\nimport yaml\nfrom multiprocessing.pool import *\n\n\nclass DBArgs(object):\n\n def __init__(self, dbtype, config, dbname=None):\n self.dbtype = dbtype\n if self.dbtype == 'mysql':\n self.host = config['host']\n self.port = config['port']\n self.user = config['user']\n self.password = config['password']\n self.dbname = dbname if dbname else config['dbname']\n self.driver = 'com.mysql.jdbc.Driver'\n self.jdbc = 'jdbc:mysql://'\n else:\n self.host = config['host']\n self.port = config['port']\n self.user = config['user']\n self.password = config['password']\n self.dbname = dbname if dbname else config['dbname']\n self.driver = 'org.postgresql.Driver'\n self.jdbc = 'jdbc:postgresql://'\n\nclass Database():\n def __init__(self, args, timeout=-1):\n self.args = args\n self.conn = self.resetConn(timeout)\n\n\n # self.schema = self.compute_table_schema()\n\n def resetConn(self, timeout=-1):\n conn = psycopg2.connect(database=self.args.dbname,\n user=self.args.user,\n password=self.args.password,\n host=self.args.host,\n port=self.args.port)\n return conn\n \n def execute_sqls(self,sql):\n self.conn =self.resetConn(timeout=-1)\n cur = self.conn.cursor()\n cur.execute(sql)\n self.conn.commit()\n cur.close()\n self.conn.close()\n\n def execute_sql_duration(self, duration, sql, max_id=0, commit_interval=500):\n self.conn = self.resetConn(timeout=-1)\n cursor = self.conn.cursor()\n start = time.time()\n cnt = 0\n if duration > 0:\n while (time.time() - start) < duration:\n if max_id > 0:\n id = random.randint(1, max_id - 1)\n cursor.execute(sql + str(id) + ';')\n else:\n cursor.execute(sql)\n cnt += 1\n if cnt % commit_interval == 0:\n self.conn.commit()\n else:\n print(\"error, the duration should be larger than 0\")\n self.conn.commit()\n cursor.close()\n self.conn.close()\n return cnt\n\n def concurrent_execute_sql(self, threads, duration, sql, max_id=0, commit_interval=500):\n pool = ThreadPool(threads)\n results = [pool.apply_async(self.execute_sql_duration, (duration, sql, max_id, commit_interval)) for _ in range(threads)]\n pool.close()\n pool.join()\n return results\n \n def build_index(self, table_name, idx_num):\n self.conn = self.resetConn(timeout=-1)\n cursor = self.conn.cursor()\n \n for i in range(0, idx_num):\n the_sql = 'CREATE INDEX index_' + table_name + '_' + str(i) + ' ON ' + table_name + '(name' + str(i) + ');'\n print(the_sql)\n cursor.execute(the_sql)\n\n \n 
self.conn.commit()\n self.conn.close()\n return\n\n\n \n def drop_index(self,table_name):\n self.conn = self.resetConn(timeout=-1)\n cursor = self.conn.cursor()\n cursor.execute(\"select indexname from pg_indexes where tablename='\"+table_name+\"';\")\n idxs = cursor.fetchall()\n for idx in idxs:\n the_sql = 'DROP INDEX ' + idx[0] + ';'\n cursor.execute(the_sql)\n print(the_sql)\n self.conn.commit()\n self.conn.close()\n return\n\n\ndef init():\n #add the config\n config_path = \"/root/DB-GPT/config/tool_config.yaml\"\n with open(config_path, 'r') as config_file:\n config = yaml.safe_load(config_file) \n db_args =DBArgs('pgsql', config)\n return db_args\n\n\n#create a table\ndef create_table(table_name,colsize, ncolumns):\n db=Database(init())\n column_definitions = ', '.join(f'name{i} varchar({colsize})' for i in range(ncolumns))\n creat_sql = f'CREATE TABLE {table_name} (id int, {column_definitions}, time timestamp);'\n db.execute_sqls(creat_sql)\n\n#delete the table\ndef delete_table(table_name):\n db=Database(init())\n delete_sql=f'DROP TABLE if exists {table_name}'\n db.execute_sqls(delete_sql)\n\n#print the current time\ndef print_time():\n current_time = datetime.datetime.now()\n formatted_time = current_time.strftime(\"%Y-%m-%d %H:%M:%S\")\n print(formatted_time)\n\ndef redundent_index(threads,duration,ncolumns,nrows,colsize,nindex,table_name='table1'):\n #create a new table\n print_time()\n delete_table(table_name)\n create_table(table_name,colsize, ncolumns)\n db=Database(init())\n # insert some data to be updated \n insert_definitions = ', '.join(f'(SELECT substr(md5(random()::text), 1, {colsize}))' for i in range(ncolumns))\n insert_data=f'insert into {table_name} select generate_series(1,{nrows}),{insert_definitions}, now();' \n db.execute_sqls(insert_data) \n\n #initialization of the indexes\n nindex=int((nindex*ncolumns)/10)\n db.build_index(table_name,nindex)\n id_index='CREATE INDEX index_'+table_name+'_id ON '+table_name+'(id);'\n db.execute_sqls(id_index)\n\n #lock_contention\n pool = Pool(threads)\n for _ in range(threads):\n pool.apply_async(\n lock, (table_name, ncolumns, colsize, duration, nrows))\n pool.close()\n pool.join()\n\n #drop the index\n db.drop_index(table_name)\n\n #delete the table\n delete_table(table_name)\n print_time()\n\n\ndef lock(table_name, ncolumns, colsize, duration, nrows):\n args=init()\n start = time.time()\n #lock_contention\n while time.time()-start < duration:\n conn = psycopg2.connect(database=args.dbname, user=args.user, password=args.password,\n host=args.host, port=args.port)\n cur = conn.cursor()\n while time.time()-start < duration:\n col_name = random.randint(0, ncolumns-1)\n row_name = random.randint(1, nrows-1)\n lock_contention = f'update {table_name} set name{col_name}=(SELECT substr(md5(random()::text), 1, {colsize})) where id ={row_name}'\n #db.concurrent_execute_sql(threads,duration,lock_contention,nrows)\n cur.execute(lock_contention)\n conn.commit()\n conn.commit()\n conn.close()\n\nif __name__ == \"__main__\":\n # Number of threads to use for concurrent inserts\n num_threads = 10\n \n # Duration for which to run the inserts (in seconds)\n insert_duration = None\n \n # Number of columns in the table\n num_columns = 63\n \n # Number of rows to insert\n num_rows = 674052\n \n # Size of each column (in characters)\n column_size = 58\n \n # Table name\n table_name = 'table1'\n \n nindex=6\n \n # Call the insert_large_data function\n redundent_index(num_threads, insert_duration, num_columns, num_rows, column_size, 
nindex,table_name)\n", + "description": "In an e-commerce platform's database, a large number of indexes are created for items such as product name, category, and price range at the beginning of the query, followed by a query of 10 users. The database table contains 63 columns and 674,052 rows of records, with each column having a size of 58 characters. This simulates the additional storage footprint and performance overhead caused by creating redundant indexes.\n", + "desc": "In a business intelligence scenario, particularly involving the financial data analysis of large corporations, there is a database called 'CorporateFinanceDB'. This database stores and processes financial information for these companies. Within this database, there are multiple tables, one of which is named 'FinancialRecords'. This table contains a total of 674,052 rows of financial data, with each row representing a specific financial record. In this table, there are 63 columns, each capable of storing up to 58 characters of information. These columns may include transaction ID, transaction type (such as income, expenditure, assets, or liabilities), transaction amount, date, department, project code, budget code, financial year, audit status, and more. In a typical business intelligence analysis process, the database administrator frequently creates redundant indexes in order to speed up queries related to financial analysis, such as departmental budget analysis, quarterly income reports, or annual audits. These indexes can be based on various factors, such as transaction type, date range, department, or project code. However, the creation of numerous redundant indexes can lead to increased storage usage and performance overhead in the database. Additionally, frequent index operations might cause database fragmentation, further impacting performance. In a business intelligence environment, these issues could result in delayed generation of reports, thus affecting the decision-making process.\n" + }, + "373": { + "start_time": "1697351259", + "end_time": "1697351350", + "start_timestamp": "2023-10-15 14:27:39", + "end_timestamp": "2023-10-15 14:29:10", + "alerts": [], + "labels": [ + "INSERT_LARGE_DATA", + "IO_CONTENTION" + ], + "command": "python anomaly_trigger/main.py --anomaly INSERT_LARGE_DATA,IO_CONTENTION", + "script": "import os\nimport datetime\n\n#print the current time\ndef print_time():\n current_time = datetime.datetime.now()\n formatted_time = current_time.strftime(\"%Y-%m-%d %H:%M:%S\")\n print(formatted_time)\n\nif __name__ == \"__main__\":\n print_time()\n command = (\n \"su - root -c 'cd /sysbench-tpcc-master; \"\n \"./tpcc.lua --db-driver=pgsql --tables=2 --scale=3 --threads=50 --events=0 \"\n \"--pgsql-host=xxxx --pgsql-user=xxxx --pgsql-password=xxxx \"\n \"--pgsql-port=5432 --pgsql-db=tpcc --time=90 --rand-type=uniform --report-interval=10 run'\"\n )\n\n os.system(command)\n print_time()\n", + "description": "In a system where multiple users are sharing and accessing files simultaneously, there is a high volume of data being uploaded, downloaded, and edited. This creates contention for the I/O resources of the system, causing slower file transfers.\n", + "desc": "In a file sharing system scenario, there is a database called 'TeamFileShareDB' used by teams or organizations for sharing files. 
This database not only stores the files themselves but also records metadata about the files, such as uploader information, file size, creation and modification dates, version history, access permissions, and download counts. During regular usage, multiple users may simultaneously upload, download, or edit files, with some users constantly uploading the latest versions of files and others downloading them for viewing or editing. Additionally, the system might be used for storing and sharing large files like presentations, video conference recordings, or design drawings. The high concurrency in file operations leads to a challenge known as I/O contention, meaning that when multiple users are simultaneously uploading or downloading large files, the system's storage and network bandwidth may become strained. This contention can result in slower file transfer speeds, especially in scenarios with limited bandwidth or insufficient server processing capabilities. Furthermore, frequent write operations in the database, such as file uploads and metadata updates, can impact database performance. During peak usage periods, the database might encounter locking and transaction management issues, which further slow down file processing and metadata recording.\n" + }, + "374": { + "start_time": "1697351410", + "end_time": "1697351470", + "start_timestamp": "2023-10-15 14:30:10", + "end_timestamp": "2023-10-15 14:31:10", + "alerts": [], + "labels": [ + "POOR JOIN PERFORMANCE", + "CPU CONTENTION" + ], + "command": "python anomaly_trigger/main.py --anomaly POOR_JOIN_PERFORMANCE,CPU_CONTENTION", + "script": "import os\nimport re\nimport time\n\nimport psycopg2\n\n\nREPEATCOUNT = 1\nTIMELOGPATH = str(int(time.time())) + \"_job_trigger_time_log.txt\"\nTIMELOG = open(TIMELOGPATH, 'w+')\n\n\nclass Database():\n\n def __init__(self):\n self.conn = None\n self.conn = psycopg2.connect(database='imdbload',\n user='xxxx',\n password='xxxx',\n host='xxxx',\n port=5432)\n\n def execute_sql(self, sql):\n fail = 1\n cur = self.conn.cursor()\n i = 0\n cnt = 3\n while fail == 1 and i < cnt:\n try:\n fail = 0\n cur.execute(sql)\n except BaseException as error:\n fail = 1\n print(error)\n res = []\n if fail == 0:\n res = cur.fetchall()\n i = i + 1\n if fail == 1:\n # print(\"SQL Execution Fatal!!\", sql)\n return 0, ''\n elif fail == 0:\n return 1, res\n\n\ndef all_sql_files():\n res_path = \"{}/join-order-benchmark-master/\".format(\n os.path.dirname(os.path.abspath(__file__)))\n # all_file_list = list(filter(file_filter, os.listdir(res_path)))\n # all_file_list = sorted(all_file_list, key=custom_sort)\n all_file_list = [\n '1a.sql', '1b.sql', '1c.sql', '1d.sql',\n '2a.sql', '2b.sql', '2c.sql', '2d.sql',\n '3a.sql', '3b.sql', '3c.sql',\n '4a.sql', '4b.sql', '4c.sql',\n '5a.sql', '5b.sql', '5c.sql',\n '6a.sql', '6b.sql', '6c.sql', '6d.sql', '6e.sql', '6f.sql',\n '7a.sql', '7b.sql', '7c.sql',\n '8a.sql', '8b.sql', '8c.sql', '8d.sql',\n '9a.sql', '9b.sql', '9c.sql', '9d.sql',\n '10a.sql', '10b.sql', '10c.sql',\n '11a.sql', '11b.sql', '11c.sql', '11d.sql',\n '12a.sql', '12b.sql', '12c.sql',\n '13a.sql', '13b.sql', '13c.sql', '13d.sql',\n '14a.sql', '14b.sql', '14c.sql',\n '15a.sql', '15b.sql', '15c.sql', '15d.sql',\n '16a.sql', '16b.sql', '16c.sql', '16d.sql',\n '17a.sql', '17b.sql', '17c.sql', '17d.sql', '17e.sql', '17f.sql',\n '18a.sql', '18b.sql', '18c.sql',\n '19a.sql', '19b.sql', '19c.sql', '19d.sql',\n '20a.sql', '20b.sql', '20c.sql',\n '21a.sql', '21b.sql', '21c.sql',\n '22a.sql', '22b.sql', '22c.sql', '22d.sql',\n 
'23a.sql', '23b.sql', '23c.sql',\n '24a.sql', '24b.sql',\n '25a.sql', '25b.sql', '25c.sql',\n '26a.sql', '26b.sql', '26c.sql',\n '27a.sql', '27b.sql', '27c.sql',\n '28a.sql', '28b.sql', '28c.sql',\n '29a.sql', '29b.sql', '29c.sql',\n '30a.sql', '30b.sql', '30c.sql',\n '31a.sql', '31b.sql', '31c.sql',\n '32a.sql', '32b.sql',\n '33a.sql', '33b.sql', '33c.sql']\n\n print(all_file_list)\n files_list = []\n for file in all_file_list:\n files_list.append(res_path + file)\n return files_list\n\n\ndef custom_sort(item):\n # \u63d0\u53d6\u6570\u5b57\u548c\u5b57\u6bcd\u90e8\u5206\n match = re.match(r'(\\d+)(\\D+)', item)\n # \u5c06\u6570\u5b57\u90e8\u5206\u8f6c\u6362\u4e3a\u6574\u6570\u4ee5\u8fdb\u884c\u6bd4\u8f83\n num_part = int(match.group(1))\n # \u8fd4\u56de\u5143\u7ec4\u4ee5\u6309\u6570\u5b57\u548c\u5b57\u6bcd\u6392\u5e8f\n return (num_part, match.group(2))\n\n\ndef file_filter(f):\n if f[-4:] == '.sql' and 'schema' not in f and 'fkindexes' not in f:\n return True\n else:\n return False\n\n\ndef get_sql_from_file(file_name):\n file = open(file_name)\n lines = file.readlines().copy()\n sql = ''\n for line in lines:\n sql += line\n sql = sql.replace('\n', ' ').replace(' ', ' ').replace(' ', ' ')\n file.close()\n return sql\n\n\ndef test_hint_from_file(sql_file):\n db = Database()\n sql = get_sql_from_file(sql_file)\n success, result_cont = db.execute_sql(sql)\n print(success, result_cont)\n\n\ndef test_all():\n sql_files = all_sql_files()\n\n for sql_file in list(sql_files)[:-10]:\n if sql_file:\n test_hint_from_file(sql_file)\n\n\ndef test_one():\n res_path = \"{}/join-order-benchmark-master/\".format(\n os.path.dirname(os.path.abspath(__file__)))\n test_hint_from_file(res_path + '1a.sql')\n\n\nif __name__ == '__main__':\n for i in range(0, REPEATCOUNT):\n TIMELOG.write(str(int(time.time()))+\";\")\n test_all()\n TIMELOG.write(str(int(time.time()))+\"\n\")\n TIMELOG.flush()\n\n TIMELOG.close()\n\n)\n", + "description": "In a data analytics company, there is a process where multiple tables are joined together to extract meaningful insights from the data. However, if the join performance is poor and there is CPU contention, it can lead to slow query execution and high CPU usage, impacting the overall performance and efficiency of the data analytics tasks.\n", + "desc": "In a Business Intelligence (BI) scenario, suppose there is a database named 'CorporateAnalyticsDB', which stores and analyzes various business data of a large corporation. This database contains multiple complex tables such as 'SalesData', 'CustomerProfiles', 'ProductCatalog', and 'MarketTrends'. These tables are filled with a large number of rows and columns, recording detailed business information.In this scenario, the company's analysts frequently need to perform complex join queries across multiple tables in order to generate comprehensive business reports. For example, they might need to join the 'SalesData' table (containing sales records) with the 'CustomerProfiles' table (containing customer information) to analyze the purchasing behaviors of different customer groups. Additionally, they might also need to link these data with the 'ProductCatalog' (the product catalog) and 'MarketTrends' (market trends) tables to gain deeper market insights.However, due to various factors such as ineffective indexing of join keys or improper query optimization, the performance of these join queries can be slow. Executing these join operations can require a significant amount of time and resources. 
During peak periods, when multiple complex join queries are executed simultaneously, there can be competition for CPU resources. This can further reduce the efficiency of the queries, resulting in poor join performance and CPU contention. CPU contention can occur when there are too many compute-intensive queries running on the database server or when the server's CPU resources are insufficient to handle these queries.\n" + }, + "375": { + "start_time": "1697351530", + "end_time": "1697351680", + "start_timestamp": "2023-10-15 14:32:10", + "end_timestamp": "2023-10-15 14:34:40", + "alerts": [], + "labels": [ + "FETCH_LARGE_DATA", + "CORRELATED SUBQUERY" + ], + "command": "python anomaly_trigger/main.py --anomaly FETCH_LARGE_DATA,CORRELATED_SUBQUERY", + "script": "import os\nimport re\nimport time\n\nimport psycopg2\n\n\nREPEATCOUNT = 1\nTIMELOGPATH = str(int(time.time())) + \"_tpch_trigger_time_log.txt\"\nTIMELOG = open(TIMELOGPATH, 'w+')\n\n\nclass Database():\n\n def __init__(self):\n self.conn = None\n self.conn = psycopg2.connect(database='tpch',\n user='xxxx',\n password='xxxx',\n host='xxxx',\n port=5432)\n\n def execute_sql(self, sql):\n fail = 1\n cur = self.conn.cursor()\n i = 0\n cnt = 3\n while fail == 1 and i < cnt:\n try:\n fail = 0\n cur.execute(sql)\n except BaseException as error:\n fail = 1\n print(error)\n res = []\n if fail == 0:\n res = cur.fetchall()\n i = i + 1\n if fail == 1:\n # print(\"SQL Execution Fatal!!\", sql)\n return 0, ''\n elif fail == 0:\n return 1, res\n\n\ndef all_sql_files():\n res_path = \"{}/tpch-queries/\".format(\n os.path.dirname(os.path.abspath(__file__)))\n # all_file_list = list(filter(file_filter, os.listdir(res_path)))\n # all_file_list = sorted(all_file_list, key=custom_sort)\n all_file_list = [\n '4.explain.sql']\n\n print(all_file_list)\n files_list = []\n for file in all_file_list:\n files_list.append(res_path + file)\n return files_list\n\n\ndef custom_sort(item):\n # \u63d0\u53d6\u6570\u5b57\u548c\u5b57\u6bcd\u90e8\u5206\n match = re.match(r'(\\d+)(\\D+)', item)\n # \u5c06\u6570\u5b57\u90e8\u5206\u8f6c\u6362\u4e3a\u6574\u6570\u4ee5\u8fdb\u884c\u6bd4\u8f83\n num_part = int(match.group(1))\n # \u8fd4\u56de\u5143\u7ec4\u4ee5\u6309\u6570\u5b57\u548c\u5b57\u6bcd\u6392\u5e8f\n return (num_part, match.group(2))\n\n\ndef file_filter(f):\n if f[-4:] == '.sql' and 'schema' not in f and 'fkindexes' not in f:\n return True\n else:\n return False\n\n\ndef get_sql_from_file(file_name):\n file = open(file_name)\n lines = file.readlines().copy()\n sql = ''\n for line in lines:\n sql += line\n sql = sql.replace('\n', ' ').replace(' ', ' ').replace(' ', ' ')\n file.close()\n return sql\n\n\ndef test_hint_from_file(sql_file):\n db = Database()\n sql = get_sql_from_file(sql_file)\n success, result_cont = db.execute_sql(sql)\n print(success, result_cont)\n\n\ndef test_all():\n sql_files = all_sql_files()\n\n for sql_file in list(sql_files):\n if sql_file:\n test_hint_from_file(sql_file)\n\n\ndef test_one():\n res_path = \"{}/tpch-queries/\".format(\n os.path.dirname(os.path.abspath(__file__)))\n test_hint_from_file(res_path + '1.explain.sql')\n\n\nif __name__ == '__main__':\n for i in range(0, REPEATCOUNT):\n TIMELOG.write(str(int(time.time()))+\";\")\n test_all()\n TIMELOG.write(str(int(time.time()))+\"\n\")\n TIMELOG.flush()\n\n TIMELOG.close()\n", + "description": "In a database for an e-commerce platform, when retrieving a large amount of data and executing related subqueries to find inventory information for each product, there may be a performance 
degradation if the subqueries are not optimized.\n", + "desc": "In an e-commerce scenario, there is a database called 'ECommerceDB' that stores information about various products. Within this database, there is a table named 'ProductInventory' which contains inventory data for tens of thousands or even hundreds of thousands of products. This inventory data includes details such as the product ID, current stock level, last inventory update time, supplier ID, and warehouse location. In this particular scenario, the database encounters a performance issue when conducting queries that involve retrieving inventory information for a large number of products. These queries often require related subqueries to obtain the necessary inventory data. However, when dealing with a large number of products, executing individual subqueries for each product becomes time-consuming and inefficient. This problem is further exacerbated by the need to read a significant amount of data from the disk, leading to I/O bottlenecks.\n" + }, + "376": { + "start_time": "1697351740", + "end_time": "1697351812", + "start_timestamp": "2023-10-15 14:35:40", + "end_timestamp": "2023-10-15 14:36:52", + "alerts": [], + "labels": [ + "highly concurrent commits or highly concurrent inserts" + ], + "command": "python anomaly_trigger/main.py --anomaly INSERT_LARGE_DATA", + "script": "import psycopg2\nimport sys\nsys.path.append('/root/DB-GPT/')\nimport time\nimport datetime\nimport random\nimport yaml\nfrom multiprocessing.pool import *\n\nclass DBArgs(object):\n\n def __init__(self, dbtype, config, dbname=None):\n self.dbtype = dbtype\n if self.dbtype == 'mysql':\n self.host = config['host']\n self.port = config['port']\n self.user = config['user']\n self.password = config['password']\n self.dbname = dbname if dbname else config['dbname']\n self.driver = 'com.mysql.jdbc.Driver'\n self.jdbc = 'jdbc:mysql://'\n else:\n self.host = config['host']\n self.port = config['port']\n self.user = config['user']\n self.password = config['password']\n self.dbname = dbname if dbname else config['dbname']\n self.driver = 'org.postgresql.Driver'\n self.jdbc = 'jdbc:postgresql://'\n\nclass Database():\n def __init__(self, args, timeout=-1):\n self.args = args\n self.conn = self.resetConn(timeout)\n\n\n # self.schema = self.compute_table_schema()\n\n def resetConn(self, timeout=-1):\n conn = psycopg2.connect(database=self.args.dbname,\n user=self.args.user,\n password=self.args.password,\n host=self.args.host,\n port=self.args.port)\n return conn\n\n def execute_sqls(self,sql):\n self.conn =self.resetConn(timeout=-1)\n cur = self.conn.cursor()\n cur.execute(sql)\n self.conn.commit()\n cur.close()\n self.conn.close()\n\n def execute_sql_duration(self, duration, sql, max_id=0, commit_interval=500):\n self.conn = self.resetConn(timeout=-1)\n cursor = self.conn.cursor()\n start = time.time()\n cnt = 0\n if duration > 0:\n while (time.time() - start) < duration:\n if max_id > 0:\n id = random.randint(1, max_id - 1)\n cursor.execute(sql + str(id) + ';')\n else:\n cursor.execute(sql)\n cnt += 1\n if cnt % commit_interval == 0:\n self.conn.commit()\n else:\n print(\"error, the duration should be larger than 0\")\n self.conn.commit()\n cursor.close()\n self.conn.close()\n return cnt\n\n def concurrent_execute_sql(self, threads, duration, sql, max_id=0, commit_interval=500):\n pool = ThreadPool(threads)\n results = [pool.apply_async(self.execute_sql_duration, (duration, sql, max_id, commit_interval)) for _ in range(threads)]\n pool.close()\n pool.join()\n 
return results\n \ndef init():\n #add the config\n config_path = \"/root/DB-GPT/config/tool_config.yaml\"\n with open(config_path, 'r') as config_file:\n config = yaml.safe_load(config_file) \n db_args =DBArgs('pgsql', config)\n return db_args\n\n#create a table\ndef create_table(table_name,colsize, ncolumns):\n db=Database(init())\n column_definitions = ', '.join(f'name{i} varchar({colsize})' for i in range(ncolumns))\n creat_sql = f'CREATE TABLE {table_name} (id int, {column_definitions}, time timestamp);'\n db.execute_sqls(creat_sql)\n\n#delete the table\ndef delete_table(table_name):\n db=Database(init())\n delete_sql=f'DROP TABLE if exists {table_name}'\n db.execute_sqls(delete_sql)\n\n#print the current time\ndef print_time():\n current_time = datetime.datetime.now()\n formatted_time = current_time.strftime(\"%Y-%m-%d %H:%M:%S\")\n print(formatted_time)\n\n\ndef insert_large_data(threads,duration,ncolumns,nrows,colsize,table_name='table1'):\n\n print_time()\n #Delete undeleted tables\n delete_table(table_name)\n #create a new table\n create_table(table_name,colsize, ncolumns)\n db=Database(init())\n #insert the data\n #insert_definitions = ', '.join(f'repeat(round(random()*999)::text,{(colsize//3)})' for i in range(ncolumns))\n insert_definitions = ', '.join(f'(SELECT substr(md5(random()::text), 1, {colsize}))' for i in range(ncolumns))\n insert_data=f'insert into {table_name} select generate_series(1,{nrows}),{insert_definitions}, now();'\n db.concurrent_execute_sql(threads,duration,insert_data,commit_interval=1)\n\n #delete the table\n delete_table(table_name)\n \n #print the end time\n print_time()\nif __name__ == \"__main__\":\n # Number of threads to use for concurrent inserts\n num_threads = 130\n \n # Duration for which to run the inserts (in seconds)\n insert_duration = None\n \n # Number of columns in the table\n num_columns = 18\n \n # Number of rows to insert\n num_rows = 92\n \n # Size of each column (in characters)\n column_size = 63\n \n # Table name\n table_name = 'table1'\n \n # Call the insert_large_data function\n insert_large_data(num_threads, insert_duration, num_columns, num_rows, column_size, table_name)\n", + "description": "In a sensor data monitoring system, a large amount of data generated by 130 sensors needs to be inserted into the database simultaneously. This can result in a database exception due to the high volume of data being inserted. The database table contains 18 columns, each with a column size of 63 characters, and there are 92 rows of data.\n", + "desc": "In a file sharing system scenario, let's imagine there is a database named 'FileShareDB' that is used by teams or organizations to share files. This database stores not only the files themselves but also the metadata associated with those files, such as uploader information, size, creation and modification timestamps, version history, access permissions, and download counts. On a typical day, multiple users might be simultaneously uploading, downloading, or editing files. For example, a project team could be collaborating on an important report, with team members frequently uploading the latest versions of files while others download them for viewing or editing. Additionally, the system may be used to store and share large files, like presentations, video conference recordings, or design drawings. With such high concurrency in file operations, the 'FileShareDB' database faces a challenge when dealing with large-scale data insertion operations. 
When multiple users simultaneously perform large-scale data insertions, such as uploading a large number of files or adding extensive metadata, it puts a significant strain on the database's processing power and storage capacity. This can lead to performance issues, including slower file upload speeds, increased response times, and potential database locking. Therefore, such large-scale data insertions can potentially trigger anomalies and affect the overall efficiency of the file sharing system.\n" + }, + "377": { + "start_time": "1697351872", + "end_time": "1697351944", + "start_timestamp": "2023-10-15 14:37:52", + "end_timestamp": "2023-10-15 14:39:04", + "alerts": [], + "labels": [ + "highly concurrent commits or highly concurrent inserts" + ], + "command": "python anomaly_trigger/main.py --anomaly INSERT_LARGE_DATA", + "script": "import psycopg2\nimport sys\nsys.path.append('/root/DB-GPT/')\nimport time\nimport datetime\nimport random\nimport yaml\nfrom multiprocessing.pool import *\n\nclass DBArgs(object):\n\n def __init__(self, dbtype, config, dbname=None):\n self.dbtype = dbtype\n if self.dbtype == 'mysql':\n self.host = config['host']\n self.port = config['port']\n self.user = config['user']\n self.password = config['password']\n self.dbname = dbname if dbname else config['dbname']\n self.driver = 'com.mysql.jdbc.Driver'\n self.jdbc = 'jdbc:mysql://'\n else:\n self.host = config['host']\n self.port = config['port']\n self.user = config['user']\n self.password = config['password']\n self.dbname = dbname if dbname else config['dbname']\n self.driver = 'org.postgresql.Driver'\n self.jdbc = 'jdbc:postgresql://'\n\nclass Database():\n def __init__(self, args, timeout=-1):\n self.args = args\n self.conn = self.resetConn(timeout)\n\n\n # self.schema = self.compute_table_schema()\n\n def resetConn(self, timeout=-1):\n conn = psycopg2.connect(database=self.args.dbname,\n user=self.args.user,\n password=self.args.password,\n host=self.args.host,\n port=self.args.port)\n return conn\n\n def execute_sqls(self,sql):\n self.conn =self.resetConn(timeout=-1)\n cur = self.conn.cursor()\n cur.execute(sql)\n self.conn.commit()\n cur.close()\n self.conn.close()\n\n def execute_sql_duration(self, duration, sql, max_id=0, commit_interval=500):\n self.conn = self.resetConn(timeout=-1)\n cursor = self.conn.cursor()\n start = time.time()\n cnt = 0\n if duration > 0:\n while (time.time() - start) < duration:\n if max_id > 0:\n id = random.randint(1, max_id - 1)\n cursor.execute(sql + str(id) + ';')\n else:\n cursor.execute(sql)\n cnt += 1\n if cnt % commit_interval == 0:\n self.conn.commit()\n else:\n print(\"error, the duration should be larger than 0\")\n self.conn.commit()\n cursor.close()\n self.conn.close()\n return cnt\n\n def concurrent_execute_sql(self, threads, duration, sql, max_id=0, commit_interval=500):\n pool = ThreadPool(threads)\n results = [pool.apply_async(self.execute_sql_duration, (duration, sql, max_id, commit_interval)) for _ in range(threads)]\n pool.close()\n pool.join()\n return results\n \ndef init():\n #add the config\n config_path = \"/root/DB-GPT/config/tool_config.yaml\"\n with open(config_path, 'r') as config_file:\n config = yaml.safe_load(config_file) \n db_args =DBArgs('pgsql', config)\n return db_args\n\n#create a table\ndef create_table(table_name,colsize, ncolumns):\n db=Database(init())\n column_definitions = ', '.join(f'name{i} varchar({colsize})' for i in range(ncolumns))\n creat_sql = f'CREATE TABLE {table_name} (id int, {column_definitions}, time timestamp);'\n 
db.execute_sqls(creat_sql)\n\n#delete the table\ndef delete_table(table_name):\n db=Database(init())\n delete_sql=f'DROP TABLE if exists {table_name}'\n db.execute_sqls(delete_sql)\n\n#print the current time\ndef print_time():\n current_time = datetime.datetime.now()\n formatted_time = current_time.strftime(\"%Y-%m-%d %H:%M:%S\")\n print(formatted_time)\n\n\ndef insert_large_data(threads,duration,ncolumns,nrows,colsize,table_name='table1'):\n\n print_time()\n #Delete undeleted tables\n delete_table(table_name)\n #create a new table\n create_table(table_name,colsize, ncolumns)\n db=Database(init())\n #insert the data\n #insert_definitions = ', '.join(f'repeat(round(random()*999)::text,{(colsize//3)})' for i in range(ncolumns))\n insert_definitions = ', '.join(f'(SELECT substr(md5(random()::text), 1, {colsize}))' for i in range(ncolumns))\n insert_data=f'insert into {table_name} select generate_series(1,{nrows}),{insert_definitions}, now();'\n db.concurrent_execute_sql(threads,duration,insert_data,commit_interval=1)\n\n #delete the table\n delete_table(table_name)\n \n #print the end time\n print_time()\nif __name__ == \"__main__\":\n # Number of threads to use for concurrent inserts\n num_threads = 130\n \n # Duration for which to run the inserts (in seconds)\n insert_duration = None\n \n # Number of columns in the table\n num_columns = 33\n \n # Number of rows to insert\n num_rows = 71\n \n # Size of each column (in characters)\n column_size = 70\n \n # Table name\n table_name = 'table1'\n \n # Call the insert_large_data function\n insert_large_data(num_threads, insert_duration, num_columns, num_rows, column_size, table_name)\n", + "description": "In an IoT application, a large amount of data generated by 130 sensors needs to be inserted into the database simultaneously. This process may cause a database exception. The database table contains 33 columns and 71 rows of data, with each column size being 70 characters.\n", + "desc": "In an Internet of Things (IoT) scenario, suppose there is a database used for collecting and analyzing data from various sensors. This database, called 'SensorDataDB', is designed to handle a large volume of sensor data. Within this database, there is a table named 'SensorReadings' which stores the data collected from the sensors. Each row in this table represents a reading from a sensor and contains information such as sensor ID, type of reading (e.g., temperature, humidity, pressure), value of the reading, timestamp, sensor location, and status. In this particular situation, 130 sensors are simultaneously transmitting data at a high frequency. The 'SensorReadings' table needs to handle the influx of data from these sensors. However, due to various factors such as insufficient buffering mechanisms, lack of proper indexing, or absence of data partitioning techniques, the database may encounter performance issues. These issues can manifest as increased write latency, database locking, or overall inefficiency in processing the large number of concurrent write requests. 
Such anomalies can impact the functionality and reliability of the system, potentially leading to delays in data processing or failures in sensor data storage.\n" + }, + "378": { + "start_time": "1697352004", + "end_time": "1697352064", + "start_timestamp": "2023-10-15 14:40:04", + "end_timestamp": "2023-10-15 14:41:04", + "alerts": [], + "labels": [ + "highly concurrent updates" + ], + "command": "python anomaly_trigger/main.py --anomaly LOCK_CONTENTION", + "script": "import psycopg2\nimport sys\nsys.path.append('/root/DB-GPT/')\nimport time\nimport datetime\nimport random\nimport yaml\nfrom multiprocessing.pool import *\n\n\nclass DBArgs(object):\n\n def __init__(self, dbtype, config, dbname=None):\n self.dbtype = dbtype\n if self.dbtype == 'mysql':\n self.host = config['host']\n self.port = config['port']\n self.user = config['user']\n self.password = config['password']\n self.dbname = dbname if dbname else config['dbname']\n self.driver = 'com.mysql.jdbc.Driver'\n self.jdbc = 'jdbc:mysql://'\n else:\n self.host = config['host']\n self.port = config['port']\n self.user = config['user']\n self.password = config['password']\n self.dbname = dbname if dbname else config['dbname']\n self.driver = 'org.postgresql.Driver'\n self.jdbc = 'jdbc:postgresql://'\n\nclass Database():\n def __init__(self, args, timeout=-1):\n self.args = args\n self.conn = self.resetConn(timeout)\n\n\n # self.schema = self.compute_table_schema()\n\n def resetConn(self, timeout=-1):\n conn = psycopg2.connect(database=self.args.dbname,\n user=self.args.user,\n password=self.args.password,\n host=self.args.host,\n port=self.args.port)\n return conn\n \n def execute_sqls(self,sql):\n self.conn =self.resetConn(timeout=-1)\n cur = self.conn.cursor()\n cur.execute(sql)\n self.conn.commit()\n cur.close()\n self.conn.close()\n\n def execute_sql_duration(self, duration, sql, max_id=0, commit_interval=500):\n self.conn = self.resetConn(timeout=-1)\n cursor = self.conn.cursor()\n start = time.time()\n cnt = 0\n if duration > 0:\n while (time.time() - start) < duration:\n if max_id > 0:\n id = random.randint(1, max_id - 1)\n cursor.execute(sql + str(id) + ';')\n else:\n cursor.execute(sql)\n cnt += 1\n if cnt % commit_interval == 0:\n self.conn.commit()\n else:\n print(\"error, the duration should be larger than 0\")\n self.conn.commit()\n cursor.close()\n self.conn.close()\n return cnt\n\n def concurrent_execute_sql(self, threads, duration, sql, max_id=0, commit_interval=500):\n pool = ThreadPool(threads)\n results = [pool.apply_async(self.execute_sql_duration, (duration, sql, max_id, commit_interval)) for _ in range(threads)]\n pool.close()\n pool.join()\n return results\n\ndef init():\n #add the config\n config_path = \"/root/DB-GPT/config/tool_config.yaml\"\n with open(config_path, 'r') as config_file:\n config = yaml.safe_load(config_file) \n db_args =DBArgs('pgsql', config)\n return db_args\n\n\n#create a table\ndef create_table(table_name,colsize, ncolumns):\n db=Database(init())\n column_definitions = ', '.join(f'name{i} varchar({colsize})' for i in range(ncolumns))\n creat_sql = f'CREATE TABLE {table_name} (id int, {column_definitions}, time timestamp);'\n db.execute_sqls(creat_sql)\n\n#delete the table\ndef delete_table(table_name):\n db=Database(init())\n delete_sql=f'DROP TABLE if exists {table_name}'\n db.execute_sqls(delete_sql)\n\n#print the current time\ndef print_time():\n current_time = datetime.datetime.now()\n formatted_time = current_time.strftime(\"%Y-%m-%d %H:%M:%S\")\n print(formatted_time)\n\n\ndef 
lock_contention(threads,duration,ncolumns,nrows,colsize,table_name='table1'):\n #create a new table\n print_time()\n delete_table(table_name)\n create_table(table_name,colsize, ncolumns)\n db=Database(init())\n # insert some data to be updated \n insert_definitions = ', '.join(f'(SELECT substr(md5(random()::text), 1, {colsize}))' for i in range(ncolumns))\n insert_data=f'insert into {table_name} select generate_series(1,{nrows}),{insert_definitions}, now();' \n db.execute_sqls(insert_data) \n pool = Pool(threads)\n for _ in range(threads):\n pool.apply_async(\n lock, (table_name, ncolumns, colsize, duration, nrows))\n pool.close()\n pool.join()\n #delete the table\n delete_table(table_name)\n print_time()\n\ndef lock(table_name, ncolumns, colsize, duration, nrows):\n args=init()\n start = time.time()\n #lock_contention\n while time.time()-start < duration:\n conn = psycopg2.connect(database=args.dbname, user=args.user, password=args.password,\n host=args.host, port=args.port)\n cur = conn.cursor()\n while time.time()-start < duration:\n col_name = random.randint(0, ncolumns-1)\n row_name = random.randint(1, nrows-1)\n lock_contention = f'update {table_name} set name{col_name}=(SELECT substr(md5(random()::text), 1, {colsize})) where id ={row_name}'\n #db.concurrent_execute_sql(threads,duration,lock_contention,nrows)\n cur.execute(lock_contention)\n conn.commit()\n conn.commit()\n conn.close()\n\nif __name__ == \"__main__\":\n # Number of threads to use for concurrent inserts\n num_threads = 122\n \n # Duration for which to run the inserts (in seconds)\n insert_duration = None\n \n # Number of columns in the table\n num_columns = 78\n \n # Number of rows to insert\n num_rows = 295\n \n # Size of each column (in characters)\n column_size = 76\n \n # Table name\n table_name = 'table1'\n \n # Call the insert_large_data function\n lock_contention(num_threads, insert_duration, num_columns, num_rows, column_size, table_name)\n", + "description": "In a database used by an online platform, 122 users are simultaneously attempting to update an inventory table with 78 columns and 295 rows, each containing data with a column size of 76 characters. These users are competing with each other to lock the table and perform the update operation. This scenario simulates a database exception caused by contention for the same resource.\n", + "desc": "In an Internet of Things (IoT) scenario, a database named 'SensorDataDB' is utilized to collect and analyze sensor data. This database is designed to handle a large volume of data from various types of sensors. The primary table in the database is called 'SensorReadings', which stores data from the sensors. Each row in this table represents a reading from a sensor, and there are a total of 295 rows of data. The table contains 78 columns, each capable of storing data up to 76 characters. These columns may include sensor ID, reading type, reading value, timestamp, sensor location, and status information. In this specific case, there are 122 threads concurrently attempting to access and modify the data in the 'SensorReadings' table. Due to the high level of concurrency, there is a possibility of lock contention occurring. Lock contention happens when multiple threads compete to acquire locks on the database, causing delays and potential performance issues. 
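For context on diagnosing the lock contention described here, the following minimal sketch (assuming PostgreSQL 9.6 or later and placeholder credentials; it is not part of the recorded scripts) lists currently blocked sessions together with the sessions holding the locks they are waiting for.

import psycopg2

# Placeholder connection settings for the monitored PostgreSQL instance.
conn = psycopg2.connect(database="postgres", user="xxxx",
                        password="xxxx", host="xxxx", port=5432)
cur = conn.cursor()

# pg_blocking_pids() (PostgreSQL 9.6+) reports which backends hold the locks a
# waiting backend needs; join to pg_stat_activity to see the statements involved.
cur.execute("""
    SELECT blocked.pid    AS blocked_pid,
           blocked.query  AS blocked_query,
           blocking.pid   AS blocking_pid,
           blocking.query AS blocking_query
    FROM pg_stat_activity AS blocked
    JOIN pg_stat_activity AS blocking
      ON blocking.pid = ANY(pg_blocking_pids(blocked.pid))
    WHERE cardinality(pg_blocking_pids(blocked.pid)) > 0;
""")
for row in cur.fetchall():
    print(row)

cur.close()
conn.close()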
If the locks are held for an extended period of time, it can disrupt the processing of sensor data, affect the accuracy of analysis results, and potentially impact the overall functionality of the IoT system.\n" + }, + "379": { + "start_time": "1697352124", + "end_time": "1697352167", + "start_timestamp": "2023-10-15 14:42:04", + "end_timestamp": "2023-10-15 14:42:47", + "alerts": [], + "labels": [ + "highly deletes" + ], + "command": "python anomaly_trigger/main.py --anomaly VACUUM", + "script": "import psycopg2\nimport sys\nsys.path.append('/root/DB-GPT/')\nimport time\nimport datetime\nimport random\nimport yaml\nfrom multiprocessing.pool import *\n\n\nclass DBArgs(object):\n\n def __init__(self, dbtype, config, dbname=None):\n self.dbtype = dbtype\n if self.dbtype == 'mysql':\n self.host = config['host']\n self.port = config['port']\n self.user = config['user']\n self.password = config['password']\n self.dbname = dbname if dbname else config['dbname']\n self.driver = 'com.mysql.jdbc.Driver'\n self.jdbc = 'jdbc:mysql://'\n else:\n self.host = config['host']\n self.port = config['port']\n self.user = config['user']\n self.password = config['password']\n self.dbname = dbname if dbname else config['dbname']\n self.driver = 'org.postgresql.Driver'\n self.jdbc = 'jdbc:postgresql://'\n\nclass Database():\n def __init__(self, args, timeout=-1):\n self.args = args\n self.conn = self.resetConn(timeout)\n\n\n # self.schema = self.compute_table_schema()\n\n def resetConn(self, timeout=-1):\n conn = psycopg2.connect(database=self.args.dbname,\n user=self.args.user,\n password=self.args.password,\n host=self.args.host,\n port=self.args.port)\n return conn\n \n def execute_sqls(self,sql):\n self.conn =self.resetConn(timeout=-1)\n cur = self.conn.cursor()\n cur.execute(sql)\n self.conn.commit()\n cur.close()\n self.conn.close()\n\n def execute_sql_duration(self, duration, sql, max_id=0, commit_interval=500):\n self.conn = self.resetConn(timeout=-1)\n cursor = self.conn.cursor()\n start = time.time()\n cnt = 0\n if duration > 0:\n while (time.time() - start) < duration:\n if max_id > 0:\n id = random.randint(1, max_id - 1)\n cursor.execute(sql + str(id) + ';')\n else:\n cursor.execute(sql)\n cnt += 1\n if cnt % commit_interval == 0:\n self.conn.commit()\n else:\n print(\"error, the duration should be larger than 0\")\n self.conn.commit()\n cursor.close()\n self.conn.close()\n return cnt\n\n def concurrent_execute_sql(self, threads, duration, sql, max_id=0, commit_interval=500):\n pool = ThreadPool(threads)\n results = [pool.apply_async(self.execute_sql_duration, (duration, sql, max_id, commit_interval)) for _ in range(threads)]\n pool.close()\n pool.join()\n return results\n \ndef init():\n #add the config\n config_path = \"/root/DB-GPT/config/tool_config.yaml\"\n with open(config_path, 'r') as config_file:\n config = yaml.safe_load(config_file) \n db_args =DBArgs('pgsql', config)\n return db_args\n\n\n#create a table\ndef create_table(table_name,colsize, ncolumns):\n db=Database(init())\n column_definitions = ', '.join(f'name{i} varchar({colsize})' for i in range(ncolumns))\n creat_sql = f'CREATE TABLE {table_name} (id int, {column_definitions}, time timestamp);'\n db.execute_sqls(creat_sql)\n\n#delete the table\ndef delete_table(table_name):\n db=Database(init())\n delete_sql=f'DROP TABLE if exists {table_name}'\n db.execute_sqls(delete_sql)\n\n#print the current time\ndef print_time():\n current_time = datetime.datetime.now()\n formatted_time = current_time.strftime(\"%Y-%m-%d %H:%M:%S\")\n 
print(formatted_time)\n\n\ndef vacuum(threads,duration,ncolumns,nrows,colsize,table_name='table1'):\n db=Database(init())\n #create a new table\n print_time()\n delete_table(table_name)\n create_table(table_name,colsize, ncolumns)\n\n # insert some data to be deleted\n insert_definitions = ', '.join(f'(SELECT substr(md5(random()::text), 1, {colsize}))' for i in range(ncolumns))\n insert_data=f'insert into {table_name} select generate_series(1,{nrows}),{insert_definitions}, now();' \n db.execute_sqls(insert_data) \n\n # delete 80% of the rows\n delete_nrows=int(nrows*0.8)\n vacuum=f'delete from {table_name} where id < {delete_nrows};'\n db.execute_sqls(vacuum)\n\n # do the select , then the vacuum occurs\n select='select * from '+table_name+' where id='\n db.concurrent_execute_sql(threads,duration,select,nrows)\n\n #delete the table\n delete_table(table_name)\n print_time()\n\n\nif __name__ == \"__main__\":\n # Number of threads to use for concurrent inserts\n num_threads = 106\n \n # Duration for which to run the inserts (in seconds)\n insert_duration = None\n \n # Number of columns in the table\n num_columns = 18\n \n # Number of rows to insert\n num_rows = 2705074\n \n # Size of each column (in characters)\n column_size = 70\n \n # Table name\n table_name = 'table1'\n \n # Call the insert_large_data function\n vacuum(num_threads, insert_duration, num_columns, num_rows, column_size, table_name)\n", + "description": "In a database system used by an online store, there is a need for frequent data cleaning operations to optimize the system's performance. This script simulates a vacuum operation on a database table containing 18 columns and approximately 2,705,074 rows of product records. The search queries in this scenario are executed by 106 users simultaneously, each searching through columns with a column size of 70 characters. The purpose of this simulation is to trigger a database exception caused by the vacuum operation and the high number of concurrent search queries.\n", + "desc": "In the database of an e-commerce platform, specifically designed for an online store, called 'OnlineStoreDB', there is a table named 'ProductRecords' which stores detailed information about products. This table contains a total of 2,705,074 rows of data, with each row representing a specific product entry. The table consists of 18 columns, with each column able to hold up to 70 characters of information. These columns include product ID, name, price, stock quantity, description, brand, category, size, color, weight, production date, expiration date, supplier information, country, rating, number of reviews, sales status, promotional information, image link, and other relevant attributes.There is a need to optimize the database by performing a VACUUM operation. VACUUM is a database maintenance operation that removes dead rows from tables and frees up storage space by reclaiming unused space. In this case, the administrator wants to reclaim space by deleting unnecessary or obsolete product records from the 'ProductRecords' table. The deletion process involves removing a large number of rows, possibly in the range of millions. Performing such a large-scale deletion operation without proper pre-processing or optimization measures can impact the database's performance. It could lead to increased processing time, decreased system responsiveness, and the potential for anomalies to occur. 
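To make the dead-row mechanics behind this VACUUM scenario concrete, here is a minimal sketch (placeholder credentials and a hypothetical lower-case table name; separate from the recorded trigger script) that inspects the dead-tuple count left behind by a bulk DELETE and then reclaims the space. VACUUM cannot run inside a transaction block, so autocommit is enabled.

import psycopg2

# Placeholder connection settings.
conn = psycopg2.connect(database="OnlineStoreDB", user="xxxx",
                        password="xxxx", host="xxxx", port=5432)
conn.autocommit = True  # required: VACUUM cannot run inside a transaction block
cur = conn.cursor()

# Dead tuples produced by a bulk DELETE remain visible here until VACUUM runs.
cur.execute("""
    SELECT relname, n_live_tup, n_dead_tup
    FROM pg_stat_user_tables
    WHERE relname = 'productrecords';
""")
print(cur.fetchall())

# Reclaim the space held by dead rows. Plain VACUUM works online; VACUUM FULL
# rewrites the table and takes an exclusive lock, so it is rarely preferable.
cur.execute("VACUUM (ANALYZE) productrecords;")

cur.close()
conn.close()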
When the operation is executed with 106 threads, it further intensifies the load on the database, which could result in degraded performance, delays in other database operations, and increased resource contention. Therefore, it is crucial to carefully plan and execute the VACUUM operation, considering factors such as batching, performing the operation during low traffic periods, and implementing optimization techniques to minimize any potential anomalies that may arise.\n" + }, + "380": { + "start_time": "1697352227", + "end_time": "1697352342", + "start_timestamp": "2023-10-15 14:43:47", + "end_timestamp": "2023-10-15 14:45:42", + "alerts": [], + "labels": [ + "too many indexes" + ], + "command": "python anomaly_trigger/main.py --anomaly REDUNDANT_INDEX", + "script": "import psycopg2\nimport sys\nsys.path.append('/root/DB-GPT/')\nimport time\nimport datetime\nimport random\nimport yaml\nfrom multiprocessing.pool import *\n\n\nclass DBArgs(object):\n\n def __init__(self, dbtype, config, dbname=None):\n self.dbtype = dbtype\n if self.dbtype == 'mysql':\n self.host = config['host']\n self.port = config['port']\n self.user = config['user']\n self.password = config['password']\n self.dbname = dbname if dbname else config['dbname']\n self.driver = 'com.mysql.jdbc.Driver'\n self.jdbc = 'jdbc:mysql://'\n else:\n self.host = config['host']\n self.port = config['port']\n self.user = config['user']\n self.password = config['password']\n self.dbname = dbname if dbname else config['dbname']\n self.driver = 'org.postgresql.Driver'\n self.jdbc = 'jdbc:postgresql://'\n\nclass Database():\n def __init__(self, args, timeout=-1):\n self.args = args\n self.conn = self.resetConn(timeout)\n\n\n # self.schema = self.compute_table_schema()\n\n def resetConn(self, timeout=-1):\n conn = psycopg2.connect(database=self.args.dbname,\n user=self.args.user,\n password=self.args.password,\n host=self.args.host,\n port=self.args.port)\n return conn\n \n def execute_sqls(self,sql):\n self.conn =self.resetConn(timeout=-1)\n cur = self.conn.cursor()\n cur.execute(sql)\n self.conn.commit()\n cur.close()\n self.conn.close()\n\n def execute_sql_duration(self, duration, sql, max_id=0, commit_interval=500):\n self.conn = self.resetConn(timeout=-1)\n cursor = self.conn.cursor()\n start = time.time()\n cnt = 0\n if duration > 0:\n while (time.time() - start) < duration:\n if max_id > 0:\n id = random.randint(1, max_id - 1)\n cursor.execute(sql + str(id) + ';')\n else:\n cursor.execute(sql)\n cnt += 1\n if cnt % commit_interval == 0:\n self.conn.commit()\n else:\n print(\"error, the duration should be larger than 0\")\n self.conn.commit()\n cursor.close()\n self.conn.close()\n return cnt\n\n def concurrent_execute_sql(self, threads, duration, sql, max_id=0, commit_interval=500):\n pool = ThreadPool(threads)\n results = [pool.apply_async(self.execute_sql_duration, (duration, sql, max_id, commit_interval)) for _ in range(threads)]\n pool.close()\n pool.join()\n return results\n \n def build_index(self, table_name, idx_num):\n self.conn = self.resetConn(timeout=-1)\n cursor = self.conn.cursor()\n \n for i in range(0, idx_num):\n the_sql = 'CREATE INDEX index_' + table_name + '_' + str(i) + ' ON ' + table_name + '(name' + str(i) + ');'\n print(the_sql)\n cursor.execute(the_sql)\n\n \n self.conn.commit()\n self.conn.close()\n return\n\n\n \n def drop_index(self,table_name):\n self.conn = self.resetConn(timeout=-1)\n cursor = self.conn.cursor()\n cursor.execute(\"select indexname from pg_indexes where tablename='\"+table_name+\"';\")\n 
idxs = cursor.fetchall()\n for idx in idxs:\n the_sql = 'DROP INDEX ' + idx[0] + ';'\n cursor.execute(the_sql)\n print(the_sql)\n self.conn.commit()\n self.conn.close()\n return\n\n\ndef init():\n #add the config\n config_path = \"/root/DB-GPT/config/tool_config.yaml\"\n with open(config_path, 'r') as config_file:\n config = yaml.safe_load(config_file) \n db_args =DBArgs('pgsql', config)\n return db_args\n\n\n#create a table\ndef create_table(table_name,colsize, ncolumns):\n db=Database(init())\n column_definitions = ', '.join(f'name{i} varchar({colsize})' for i in range(ncolumns))\n creat_sql = f'CREATE TABLE {table_name} (id int, {column_definitions}, time timestamp);'\n db.execute_sqls(creat_sql)\n\n#delete the table\ndef delete_table(table_name):\n db=Database(init())\n delete_sql=f'DROP TABLE if exists {table_name}'\n db.execute_sqls(delete_sql)\n\n#print the current time\ndef print_time():\n current_time = datetime.datetime.now()\n formatted_time = current_time.strftime(\"%Y-%m-%d %H:%M:%S\")\n print(formatted_time)\n\ndef redundent_index(threads,duration,ncolumns,nrows,colsize,nindex,table_name='table1'):\n #create a new table\n print_time()\n delete_table(table_name)\n create_table(table_name,colsize, ncolumns)\n db=Database(init())\n # insert some data to be updated \n insert_definitions = ', '.join(f'(SELECT substr(md5(random()::text), 1, {colsize}))' for i in range(ncolumns))\n insert_data=f'insert into {table_name} select generate_series(1,{nrows}),{insert_definitions}, now();' \n db.execute_sqls(insert_data) \n\n #initialization of the indexes\n nindex=int((nindex*ncolumns)/10)\n db.build_index(table_name,nindex)\n id_index='CREATE INDEX index_'+table_name+'_id ON '+table_name+'(id);'\n db.execute_sqls(id_index)\n\n #lock_contention\n pool = Pool(threads)\n for _ in range(threads):\n pool.apply_async(\n lock, (table_name, ncolumns, colsize, duration, nrows))\n pool.close()\n pool.join()\n\n #drop the index\n db.drop_index(table_name)\n\n #delete the table\n delete_table(table_name)\n print_time()\n\n\ndef lock(table_name, ncolumns, colsize, duration, nrows):\n args=init()\n start = time.time()\n #lock_contention\n while time.time()-start < duration:\n conn = psycopg2.connect(database=args.dbname, user=args.user, password=args.password,\n host=args.host, port=args.port)\n cur = conn.cursor()\n while time.time()-start < duration:\n col_name = random.randint(0, ncolumns-1)\n row_name = random.randint(1, nrows-1)\n lock_contention = f'update {table_name} set name{col_name}=(SELECT substr(md5(random()::text), 1, {colsize})) where id ={row_name}'\n #db.concurrent_execute_sql(threads,duration,lock_contention,nrows)\n cur.execute(lock_contention)\n conn.commit()\n conn.commit()\n conn.close()\n\nif __name__ == \"__main__\":\n # Number of threads to use for concurrent inserts\n num_threads = 7\n \n # Duration for which to run the inserts (in seconds)\n insert_duration = None\n \n # Number of columns in the table\n num_columns = 64\n \n # Number of rows to insert\n num_rows = 505860\n \n # Size of each column (in characters)\n column_size = 80\n \n # Table name\n table_name = 'table1'\n \n nindex=6\n \n # Call the insert_large_data function\n redundent_index(num_threads, insert_duration, num_columns, num_rows, column_size, nindex,table_name)\n", + "description": "In a database with 64 columns and 505,860 rows, each column size is 80 characters, a large number of indexes are created at the beginning of the query for items such as product name, category, and price range. 
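A quick way to surface the redundant-index overhead this record exercises is to ask the catalog which indexes are never scanned. The sketch below is an editorial illustration only (placeholder database name and credentials) and is not part of the dataset.

import psycopg2

# Placeholder connection settings.
conn = psycopg2.connect(database="BusinessIntelDB", user="xxxx",
                        password="xxxx", host="xxxx", port=5432)
cur = conn.cursor()

# Indexes that are never scanned are candidates for removal: they consume
# storage and slow every write without speeding up any read.
cur.execute("""
    SELECT s.relname      AS table_name,
           s.indexrelname AS index_name,
           s.idx_scan     AS scans,
           pg_size_pretty(pg_relation_size(s.indexrelid)) AS index_size,
           i.indexdef
    FROM pg_stat_user_indexes s
    JOIN pg_indexes i
      ON i.indexname = s.indexrelname AND i.tablename = s.relname
    ORDER BY s.idx_scan ASC, pg_relation_size(s.indexrelid) DESC;
""")
for row in cur.fetchall():
    print(row)

cur.close()
conn.close()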
Seven users execute queries, and after the query operation, these indexes are deleted. This simulates the additional storage footprint and performance overhead caused by this process.\n", + "desc": "In the business intelligence scenario, there is a database named 'BusinessIntelDB' that is used to store and analyze various types of business data. Within this database, there is a table called 'BusinessRecords' that contains a large number of rows and columns, with a total of 505,860 rows. Each row represents a specific business record, with 64 columns that store information related to the business, such as transaction details, financial data, customer information, and more. In order to perform efficient data analysis and generate meaningful insights, the administrators of the database often create multiple indexes on certain columns of the 'BusinessRecords' table. These indexes are created to accelerate complex queries, enhance the speed of data retrieval, and optimize the performance of the database. However, in this specific scenario, due to an excessive number of indexes being created, there is a redundant index problem. The redundant indexes in the 'BusinessRecords' table lead to increased storage usage and additional performance overhead. Moreover, frequent index operations can result in database fragmentation, which further affects the performance of the database. When multiple users simultaneously execute complex queries on the 'BusinessRecords' table during busy times, the presence of redundant indexes can cause delayed report generation and impact the efficiency of decision-making processes. Specifically, in this scenario, with 7 users executing queries, thread contention might occur due to the excessive index operations, resulting in degraded database performance.\n" + }, + "381": { + "start_time": "1697352402", + "end_time": "1697352493", + "start_timestamp": "2023-10-15 14:46:42", + "end_timestamp": "2023-10-15 14:48:13", + "alerts": [], + "labels": [ + "INSERT_LARGE_DATA", + "IO_CONTENTION" + ], + "command": "python anomaly_trigger/main.py --anomaly INSERT_LARGE_DATA,IO_CONTENTION", + "script": "import os\nimport datetime\n\n#print the current time\ndef print_time():\n current_time = datetime.datetime.now()\n formatted_time = current_time.strftime(\"%Y-%m-%d %H:%M:%S\")\n print(formatted_time)\n\nif __name__ == \"__main__\":\n print_time()\n command = (\n \"su - root -c 'cd /sysbench-tpcc-master; \"\n \"./tpcc.lua --db-driver=pgsql --tables=2 --scale=3 --threads=50 --events=0 \"\n \"--pgsql-host=xxxx --pgsql-user=xxxx --pgsql-password=xxxx \"\n \"--pgsql-port=5432 --pgsql-db=tpcc --time=90 --rand-type=uniform --report-interval=10 run'\"\n )\n\n os.system(command)\n print_time()\n", + "description": "In a collaborative document editing platform, multiple users are uploading, downloading, or editing files simultaneously, resulting in competition for input/output resources. This leads to a slowdown in file transfer operations.\n", + "desc": "In the file transfer system scenario, imagine a database named 'FileShareDB', which is used by teams or organizations for sharing files. This database not only stores the files themselves but also records metadata such as file size, uploader information, creation and modification dates, version history, access permissions, and download counts. On a typical workday, multiple users might be simultaneously uploading, downloading, or editing files. 
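Since this record pairs large inserts with I/O contention, it may help to note how the relevant I/O metrics can be read from PostgreSQL itself. The sketch below is an editorial illustration, not part of the dataset; credentials are placeholders, and blk_read_time/blk_write_time are only populated when track_io_timing is enabled on the server.

import psycopg2

# Placeholder connection settings.
conn = psycopg2.connect(database="FileShareDB", user="xxxx",
                        password="xxxx", host="xxxx", port=5432)
cur = conn.cursor()

# Per-database buffer cache hit ratio and block read/write timings; a falling
# hit ratio together with rising read/write times is a common I/O-contention sign.
cur.execute("""
    SELECT datname,
           blks_hit,
           blks_read,
           round(blks_hit::numeric / NULLIF(blks_hit + blks_read, 0), 4)
               AS cache_hit_ratio,
           blk_read_time,
           blk_write_time
    FROM pg_stat_database
    WHERE datname IS NOT NULL;
""")
for row in cur.fetchall():
    print(row)

cur.close()
conn.close()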
The system might also be used for storing and sharing large files such as presentations, video conference recordings, or design drawings. Due to the high concurrency in file operations, the 'FileShareDB' database faces challenges in terms of I/O (input/output) contention. When multiple users simultaneously upload or download large files, the system's storage and network bandwidth might be significantly strained. This I/O contention can lead to slower file transfer speeds, especially in situations where there is limited bandwidth or insufficient server processing capabilities. Additionally, frequent write operations in the database, such as file uploads and metadata updates, can also impact database performance. During peak periods, the database might encounter locking and transaction management issues, further slowing down file processing and metadata recording.\n" + }, + "382": { + "start_time": "1697352553", + "end_time": "1697352613", + "start_timestamp": "2023-10-15 14:49:13", + "end_timestamp": "2023-10-15 14:50:13", + "alerts": [], + "labels": [ + "POOR JOIN PERFORMANCE", + "CPU CONTENTION" + ], + "command": "python anomaly_trigger/main.py --anomaly POOR_JOIN_PERFORMANCE,CPU_CONTENTION", + "script": "import os\nimport re\nimport time\n\nimport psycopg2\n\n\nREPEATCOUNT = 1\nTIMELOGPATH = str(int(time.time())) + \"_job_trigger_time_log.txt\"\nTIMELOG = open(TIMELOGPATH, 'w+')\n\n\nclass Database():\n\n def __init__(self):\n self.conn = None\n self.conn = psycopg2.connect(database='imdbload',\n user='xxxx',\n password='xxxx',\n host='xxxx',\n port=5432)\n\n def execute_sql(self, sql):\n fail = 1\n cur = self.conn.cursor()\n i = 0\n cnt = 3\n while fail == 1 and i < cnt:\n try:\n fail = 0\n cur.execute(sql)\n except BaseException as error:\n fail = 1\n print(error)\n res = []\n if fail == 0:\n res = cur.fetchall()\n i = i + 1\n if fail == 1:\n # print(\"SQL Execution Fatal!!\", sql)\n return 0, ''\n elif fail == 0:\n return 1, res\n\n\ndef all_sql_files():\n res_path = \"{}/join-order-benchmark-master/\".format(\n os.path.dirname(os.path.abspath(__file__)))\n # all_file_list = list(filter(file_filter, os.listdir(res_path)))\n # all_file_list = sorted(all_file_list, key=custom_sort)\n all_file_list = [\n '1a.sql', '1b.sql', '1c.sql', '1d.sql',\n '2a.sql', '2b.sql', '2c.sql', '2d.sql',\n '3a.sql', '3b.sql', '3c.sql',\n '4a.sql', '4b.sql', '4c.sql',\n '5a.sql', '5b.sql', '5c.sql',\n '6a.sql', '6b.sql', '6c.sql', '6d.sql', '6e.sql', '6f.sql',\n '7a.sql', '7b.sql', '7c.sql',\n '8a.sql', '8b.sql', '8c.sql', '8d.sql',\n '9a.sql', '9b.sql', '9c.sql', '9d.sql',\n '10a.sql', '10b.sql', '10c.sql',\n '11a.sql', '11b.sql', '11c.sql', '11d.sql',\n '12a.sql', '12b.sql', '12c.sql',\n '13a.sql', '13b.sql', '13c.sql', '13d.sql',\n '14a.sql', '14b.sql', '14c.sql',\n '15a.sql', '15b.sql', '15c.sql', '15d.sql',\n '16a.sql', '16b.sql', '16c.sql', '16d.sql',\n '17a.sql', '17b.sql', '17c.sql', '17d.sql', '17e.sql', '17f.sql',\n '18a.sql', '18b.sql', '18c.sql',\n '19a.sql', '19b.sql', '19c.sql', '19d.sql',\n '20a.sql', '20b.sql', '20c.sql',\n '21a.sql', '21b.sql', '21c.sql',\n '22a.sql', '22b.sql', '22c.sql', '22d.sql',\n '23a.sql', '23b.sql', '23c.sql',\n '24a.sql', '24b.sql',\n '25a.sql', '25b.sql', '25c.sql',\n '26a.sql', '26b.sql', '26c.sql',\n '27a.sql', '27b.sql', '27c.sql',\n '28a.sql', '28b.sql', '28c.sql',\n '29a.sql', '29b.sql', '29c.sql',\n '30a.sql', '30b.sql', '30c.sql',\n '31a.sql', '31b.sql', '31c.sql',\n '32a.sql', '32b.sql',\n '33a.sql', '33b.sql', '33c.sql']\n\n print(all_file_list)\n 
files_list = []\n for file in all_file_list:\n files_list.append(res_path + file)\n return files_list\n\n\ndef custom_sort(item):\n # \u63d0\u53d6\u6570\u5b57\u548c\u5b57\u6bcd\u90e8\u5206\n match = re.match(r'(\\d+)(\\D+)', item)\n # \u5c06\u6570\u5b57\u90e8\u5206\u8f6c\u6362\u4e3a\u6574\u6570\u4ee5\u8fdb\u884c\u6bd4\u8f83\n num_part = int(match.group(1))\n # \u8fd4\u56de\u5143\u7ec4\u4ee5\u6309\u6570\u5b57\u548c\u5b57\u6bcd\u6392\u5e8f\n return (num_part, match.group(2))\n\n\ndef file_filter(f):\n if f[-4:] == '.sql' and 'schema' not in f and 'fkindexes' not in f:\n return True\n else:\n return False\n\n\ndef get_sql_from_file(file_name):\n file = open(file_name)\n lines = file.readlines().copy()\n sql = ''\n for line in lines:\n sql += line\n sql = sql.replace('\n', ' ').replace(' ', ' ').replace(' ', ' ')\n file.close()\n return sql\n\n\ndef test_hint_from_file(sql_file):\n db = Database()\n sql = get_sql_from_file(sql_file)\n success, result_cont = db.execute_sql(sql)\n print(success, result_cont)\n\n\ndef test_all():\n sql_files = all_sql_files()\n\n for sql_file in list(sql_files)[:-10]:\n if sql_file:\n test_hint_from_file(sql_file)\n\n\ndef test_one():\n res_path = \"{}/join-order-benchmark-master/\".format(\n os.path.dirname(os.path.abspath(__file__)))\n test_hint_from_file(res_path + '1a.sql')\n\n\nif __name__ == '__main__':\n for i in range(0, REPEATCOUNT):\n TIMELOG.write(str(int(time.time()))+\";\")\n test_all()\n TIMELOG.write(str(int(time.time()))+\"\n\")\n TIMELOG.flush()\n\n TIMELOG.close()\n\n)\n", + "description": "In a data analysis task, multiple users are performing a join operation on a large dataset using a script. The join operation is not optimized, resulting in poor performance. Additionally, there is contention for CPU resources, leading to further degradation of the script's performance.\n", + "desc": "In an IoT scenario, there is a database called 'SmartHomeDB' that stores information related to various smart home devices. One key table in this database is named 'DeviceData', which contains data from different types of sensors and devices installed in a smart home. The table consists of multiple columns, including device ID, device type (such as motion sensor, temperature sensor, camera, etc.), device status, timestamp, sensor readings (such as temperature, humidity, light intensity), and more. During the normal operation of a smart home, multiple devices continuously send sensor data to the 'DeviceData' table. For instance, a motion sensor might regularly send occupancy data, and a temperature sensor might frequently send temperature readings. Due to the high frequency of data being written to the table, the database might encounter performance issues. This could be due to the inefficiency of the indexing strategy used for handling write operations, or the lack of optimized query execution plans for the queries involving sensor data analysis.As a result, poor join performance might occur when the database needs to execute complex join queries to analyze relationships and patterns between different types of sensor data. For example, a query could involve joining the 'DeviceData' table with other tables to find correlations between temperature readings and occupancy data. Due to the large volume of data and the complexity of the join operations, these queries might take a long time to execute, impacting the overall performance of the database.Moreover, CPU contention can also occur in this scenario. 
When multiple complex join queries are running concurrently, they might compete for CPU resources. If the database server's CPU resources are insufficient to handle these resource-intensive queries, the overall query performance can be significantly affected. This contention can lead to longer query execution times, delays in processing real-time data, and reduced responsiveness in smart home applications.\n" + }, + "383": { + "start_time": "1697352673", + "end_time": "1697352823", + "start_timestamp": "2023-10-15 14:51:13", + "end_timestamp": "2023-10-15 14:53:43", + "alerts": [], + "labels": [ + "FETCH_LARGE_DATA", + "CORRELATED SUBQUERY" + ], + "command": "python anomaly_trigger/main.py --anomaly FETCH_LARGE_DATA,CORRELATED_SUBQUERY", + "script": "import os\nimport re\nimport time\n\nimport psycopg2\n\n\nREPEATCOUNT = 1\nTIMELOGPATH = str(int(time.time())) + \"_tpch_trigger_time_log.txt\"\nTIMELOG = open(TIMELOGPATH, 'w+')\n\n\nclass Database():\n\n def __init__(self):\n self.conn = None\n self.conn = psycopg2.connect(database='tpch',\n user='xxxx',\n password='xxxx',\n host='xxxx',\n port=5432)\n\n def execute_sql(self, sql):\n fail = 1\n cur = self.conn.cursor()\n i = 0\n cnt = 3\n while fail == 1 and i < cnt:\n try:\n fail = 0\n cur.execute(sql)\n except BaseException as error:\n fail = 1\n print(error)\n res = []\n if fail == 0:\n res = cur.fetchall()\n i = i + 1\n if fail == 1:\n # print(\"SQL Execution Fatal!!\", sql)\n return 0, ''\n elif fail == 0:\n return 1, res\n\n\ndef all_sql_files():\n res_path = \"{}/tpch-queries/\".format(\n os.path.dirname(os.path.abspath(__file__)))\n # all_file_list = list(filter(file_filter, os.listdir(res_path)))\n # all_file_list = sorted(all_file_list, key=custom_sort)\n all_file_list = [\n '4.explain.sql']\n\n print(all_file_list)\n files_list = []\n for file in all_file_list:\n files_list.append(res_path + file)\n return files_list\n\n\ndef custom_sort(item):\n # \u63d0\u53d6\u6570\u5b57\u548c\u5b57\u6bcd\u90e8\u5206\n match = re.match(r'(\\d+)(\\D+)', item)\n # \u5c06\u6570\u5b57\u90e8\u5206\u8f6c\u6362\u4e3a\u6574\u6570\u4ee5\u8fdb\u884c\u6bd4\u8f83\n num_part = int(match.group(1))\n # \u8fd4\u56de\u5143\u7ec4\u4ee5\u6309\u6570\u5b57\u548c\u5b57\u6bcd\u6392\u5e8f\n return (num_part, match.group(2))\n\n\ndef file_filter(f):\n if f[-4:] == '.sql' and 'schema' not in f and 'fkindexes' not in f:\n return True\n else:\n return False\n\n\ndef get_sql_from_file(file_name):\n file = open(file_name)\n lines = file.readlines().copy()\n sql = ''\n for line in lines:\n sql += line\n sql = sql.replace('\n', ' ').replace(' ', ' ').replace(' ', ' ')\n file.close()\n return sql\n\n\ndef test_hint_from_file(sql_file):\n db = Database()\n sql = get_sql_from_file(sql_file)\n success, result_cont = db.execute_sql(sql)\n print(success, result_cont)\n\n\ndef test_all():\n sql_files = all_sql_files()\n\n for sql_file in list(sql_files):\n if sql_file:\n test_hint_from_file(sql_file)\n\n\ndef test_one():\n res_path = \"{}/tpch-queries/\".format(\n os.path.dirname(os.path.abspath(__file__)))\n test_hint_from_file(res_path + '1.explain.sql')\n\n\nif __name__ == '__main__':\n for i in range(0, REPEATCOUNT):\n TIMELOG.write(str(int(time.time()))+\";\")\n test_all()\n TIMELOG.write(str(int(time.time()))+\"\n\")\n TIMELOG.flush()\n\n TIMELOG.close()\n", + "description": "In an online retail database, there is a need to fetch large amounts of data, specifically the inventory for each product. This requires the execution of correlated subqueries. 
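The per-product correlated subquery pattern mentioned here, and its usual join-plus-aggregate rewrite, can be sketched as follows. Table and column names are hypothetical stand-ins for the scenario's product and inventory tables, the credentials are placeholders, and only EXPLAIN is issued, so nothing is modified.

import psycopg2

# Placeholder connection settings.
conn = psycopg2.connect(database="ECommerceDB", user="xxxx",
                        password="xxxx", host="xxxx", port=5432)
cur = conn.cursor()

# Correlated form: the inner query runs once per outer row.
correlated = """
    EXPLAIN
    SELECT p.product_id,
           (SELECT SUM(i.stock_level)
              FROM ProductInventory i
             WHERE i.product_id = p.product_id) AS total_stock
    FROM ProductDetails p;
"""

# Equivalent join/aggregate form: the inventory table is scanned once.
rewritten = """
    EXPLAIN
    SELECT p.product_id, SUM(i.stock_level) AS total_stock
    FROM ProductDetails p
    LEFT JOIN ProductInventory i ON i.product_id = p.product_id
    GROUP BY p.product_id;
"""

for label, sql in (("correlated", correlated), ("rewritten", rewritten)):
    cur.execute(sql)
    print(f"--- {label} plan ---")
    for (line,) in cur.fetchall():
        print(line)

cur.close()
conn.close()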
If these subqueries are not properly optimized, the performance of querying inventory may deteriorate.\n", + "desc": "In an e-commerce scenario, suppose there is a database named 'ECommerceDB' used for storing and managing various product information. One of the key tables in this database is called 'ProductDetails', which contains detailed information about each product, such as product ID, name, price, stock quantity, brand, category, size, color, weight, user rating, and more. In this scenario, due to the need to analyze and generate reports on product inventory, the database might encounter performance issues when executing queries that involve fetching a large amount of data and performing correlated subqueries. For example, if a user wants to determine the total inventory of all products within a specific category, the database might need to retrieve a significant amount of data from the 'ProductDetails' table and perform subqueries to obtain inventory information for each product in that category. This can lead to slow query execution and potential I/O bottlenecks, especially when dealing with a large number of products or categories containing a large number of products.\n" + }, + "384": { + "start_time": "1697352883", + "end_time": "1697352955", + "start_timestamp": "2023-10-15 14:54:43", + "end_timestamp": "2023-10-15 14:55:55", + "alerts": [], + "labels": [ + "highly concurrent commits or highly concurrent inserts" + ], + "command": "python anomaly_trigger/main.py --anomaly INSERT_LARGE_DATA", + "script": "import psycopg2\nimport sys\nsys.path.append('/root/DB-GPT/')\nimport time\nimport datetime\nimport random\nimport yaml\nfrom multiprocessing.pool import *\n\nclass DBArgs(object):\n\n def __init__(self, dbtype, config, dbname=None):\n self.dbtype = dbtype\n if self.dbtype == 'mysql':\n self.host = config['host']\n self.port = config['port']\n self.user = config['user']\n self.password = config['password']\n self.dbname = dbname if dbname else config['dbname']\n self.driver = 'com.mysql.jdbc.Driver'\n self.jdbc = 'jdbc:mysql://'\n else:\n self.host = config['host']\n self.port = config['port']\n self.user = config['user']\n self.password = config['password']\n self.dbname = dbname if dbname else config['dbname']\n self.driver = 'org.postgresql.Driver'\n self.jdbc = 'jdbc:postgresql://'\n\nclass Database():\n def __init__(self, args, timeout=-1):\n self.args = args\n self.conn = self.resetConn(timeout)\n\n\n # self.schema = self.compute_table_schema()\n\n def resetConn(self, timeout=-1):\n conn = psycopg2.connect(database=self.args.dbname,\n user=self.args.user,\n password=self.args.password,\n host=self.args.host,\n port=self.args.port)\n return conn\n\n def execute_sqls(self,sql):\n self.conn =self.resetConn(timeout=-1)\n cur = self.conn.cursor()\n cur.execute(sql)\n self.conn.commit()\n cur.close()\n self.conn.close()\n\n def execute_sql_duration(self, duration, sql, max_id=0, commit_interval=500):\n self.conn = self.resetConn(timeout=-1)\n cursor = self.conn.cursor()\n start = time.time()\n cnt = 0\n if duration > 0:\n while (time.time() - start) < duration:\n if max_id > 0:\n id = random.randint(1, max_id - 1)\n cursor.execute(sql + str(id) + ';')\n else:\n cursor.execute(sql)\n cnt += 1\n if cnt % commit_interval == 0:\n self.conn.commit()\n else:\n print(\"error, the duration should be larger than 0\")\n self.conn.commit()\n cursor.close()\n self.conn.close()\n return cnt\n\n def concurrent_execute_sql(self, threads, duration, sql, max_id=0, commit_interval=500):\n pool 
= ThreadPool(threads)\n results = [pool.apply_async(self.execute_sql_duration, (duration, sql, max_id, commit_interval)) for _ in range(threads)]\n pool.close()\n pool.join()\n return results\n \ndef init():\n #add the config\n config_path = \"/root/DB-GPT/config/tool_config.yaml\"\n with open(config_path, 'r') as config_file:\n config = yaml.safe_load(config_file) \n db_args =DBArgs('pgsql', config)\n return db_args\n\n#create a table\ndef create_table(table_name,colsize, ncolumns):\n db=Database(init())\n column_definitions = ', '.join(f'name{i} varchar({colsize})' for i in range(ncolumns))\n creat_sql = f'CREATE TABLE {table_name} (id int, {column_definitions}, time timestamp);'\n db.execute_sqls(creat_sql)\n\n#delete the table\ndef delete_table(table_name):\n db=Database(init())\n delete_sql=f'DROP TABLE if exists {table_name}'\n db.execute_sqls(delete_sql)\n\n#print the current time\ndef print_time():\n current_time = datetime.datetime.now()\n formatted_time = current_time.strftime(\"%Y-%m-%d %H:%M:%S\")\n print(formatted_time)\n\n\ndef insert_large_data(threads,duration,ncolumns,nrows,colsize,table_name='table1'):\n\n print_time()\n #Delete undeleted tables\n delete_table(table_name)\n #create a new table\n create_table(table_name,colsize, ncolumns)\n db=Database(init())\n #insert the data\n #insert_definitions = ', '.join(f'repeat(round(random()*999)::text,{(colsize//3)})' for i in range(ncolumns))\n insert_definitions = ', '.join(f'(SELECT substr(md5(random()::text), 1, {colsize}))' for i in range(ncolumns))\n insert_data=f'insert into {table_name} select generate_series(1,{nrows}),{insert_definitions}, now();'\n db.concurrent_execute_sql(threads,duration,insert_data,commit_interval=1)\n\n #delete the table\n delete_table(table_name)\n \n #print the end time\n print_time()\nif __name__ == \"__main__\":\n # Number of threads to use for concurrent inserts\n num_threads = 157\n \n # Duration for which to run the inserts (in seconds)\n insert_duration = None\n \n # Number of columns in the table\n num_columns = 10\n \n # Number of rows to insert\n num_rows = 98\n \n # Size of each column (in characters)\n column_size = 78\n \n # Table name\n table_name = 'table1'\n \n # Call the insert_large_data function\n insert_large_data(num_threads, insert_duration, num_columns, num_rows, column_size, table_name)\n", + "description": "In a financial reporting system, 157 users simultaneously submit a large amount of data for processing. Each data entry contains 10 columns with a column size of 78 characters, and there are a total of 98 data entries. The system is overloaded and experiences a slowdown due to the high number of data submissions.\n", + "desc": "In the context of a file sharing system, let's say there is a database called 'FileShareDB' which is used for sharing files among users. This database stores not only the files themselves but also metadata related to the files, such as uploader information, file size, creation date, modification date, access permissions, and download counts. On a typical day, there are multiple users who are uploading, downloading, or editing files simultaneously. For example, a team working on a project might be constantly uploading new versions of files while others in the team download and edit those files. The system is also used for storing and sharing large files such as presentations, video conference recordings, or design drawings. 
In such a high concurrency scenario, when multiple users are uploading or downloading large files at the same time, it can result in a contention for input/output (I/O) resources. This contention can manifest as slower file transfer speeds, especially if there is limited bandwidth or insufficient server processing capability. Additionally, frequent write operations, such as file uploads and metadata updates, can impact the performance of the database. During peak periods, the database might experience issues with locking and transaction management, resulting in further slowdowns in file processing and metadata recording.\n" + }, + "385": { + "start_time": "1697353015", + "end_time": "1697353087", + "start_timestamp": "2023-10-15 14:56:55", + "end_timestamp": "2023-10-15 14:58:07", + "alerts": [], + "labels": [ + "highly concurrent commits or highly concurrent inserts" + ], + "command": "python anomaly_trigger/main.py --anomaly INSERT_LARGE_DATA", + "script": "import psycopg2\nimport sys\nsys.path.append('/root/DB-GPT/')\nimport time\nimport datetime\nimport random\nimport yaml\nfrom multiprocessing.pool import *\n\nclass DBArgs(object):\n\n def __init__(self, dbtype, config, dbname=None):\n self.dbtype = dbtype\n if self.dbtype == 'mysql':\n self.host = config['host']\n self.port = config['port']\n self.user = config['user']\n self.password = config['password']\n self.dbname = dbname if dbname else config['dbname']\n self.driver = 'com.mysql.jdbc.Driver'\n self.jdbc = 'jdbc:mysql://'\n else:\n self.host = config['host']\n self.port = config['port']\n self.user = config['user']\n self.password = config['password']\n self.dbname = dbname if dbname else config['dbname']\n self.driver = 'org.postgresql.Driver'\n self.jdbc = 'jdbc:postgresql://'\n\nclass Database():\n def __init__(self, args, timeout=-1):\n self.args = args\n self.conn = self.resetConn(timeout)\n\n\n # self.schema = self.compute_table_schema()\n\n def resetConn(self, timeout=-1):\n conn = psycopg2.connect(database=self.args.dbname,\n user=self.args.user,\n password=self.args.password,\n host=self.args.host,\n port=self.args.port)\n return conn\n\n def execute_sqls(self,sql):\n self.conn =self.resetConn(timeout=-1)\n cur = self.conn.cursor()\n cur.execute(sql)\n self.conn.commit()\n cur.close()\n self.conn.close()\n\n def execute_sql_duration(self, duration, sql, max_id=0, commit_interval=500):\n self.conn = self.resetConn(timeout=-1)\n cursor = self.conn.cursor()\n start = time.time()\n cnt = 0\n if duration > 0:\n while (time.time() - start) < duration:\n if max_id > 0:\n id = random.randint(1, max_id - 1)\n cursor.execute(sql + str(id) + ';')\n else:\n cursor.execute(sql)\n cnt += 1\n if cnt % commit_interval == 0:\n self.conn.commit()\n else:\n print(\"error, the duration should be larger than 0\")\n self.conn.commit()\n cursor.close()\n self.conn.close()\n return cnt\n\n def concurrent_execute_sql(self, threads, duration, sql, max_id=0, commit_interval=500):\n pool = ThreadPool(threads)\n results = [pool.apply_async(self.execute_sql_duration, (duration, sql, max_id, commit_interval)) for _ in range(threads)]\n pool.close()\n pool.join()\n return results\n \ndef init():\n #add the config\n config_path = \"/root/DB-GPT/config/tool_config.yaml\"\n with open(config_path, 'r') as config_file:\n config = yaml.safe_load(config_file) \n db_args =DBArgs('pgsql', config)\n return db_args\n\n#create a table\ndef create_table(table_name,colsize, ncolumns):\n db=Database(init())\n column_definitions = ', '.join(f'name{i} 
varchar({colsize})' for i in range(ncolumns))\n creat_sql = f'CREATE TABLE {table_name} (id int, {column_definitions}, time timestamp);'\n db.execute_sqls(creat_sql)\n\n#delete the table\ndef delete_table(table_name):\n db=Database(init())\n delete_sql=f'DROP TABLE if exists {table_name}'\n db.execute_sqls(delete_sql)\n\n#print the current time\ndef print_time():\n current_time = datetime.datetime.now()\n formatted_time = current_time.strftime(\"%Y-%m-%d %H:%M:%S\")\n print(formatted_time)\n\n\ndef insert_large_data(threads,duration,ncolumns,nrows,colsize,table_name='table1'):\n\n print_time()\n #Delete undeleted tables\n delete_table(table_name)\n #create a new table\n create_table(table_name,colsize, ncolumns)\n db=Database(init())\n #insert the data\n #insert_definitions = ', '.join(f'repeat(round(random()*999)::text,{(colsize//3)})' for i in range(ncolumns))\n insert_definitions = ', '.join(f'(SELECT substr(md5(random()::text), 1, {colsize}))' for i in range(ncolumns))\n insert_data=f'insert into {table_name} select generate_series(1,{nrows}),{insert_definitions}, now();'\n db.concurrent_execute_sql(threads,duration,insert_data,commit_interval=1)\n\n #delete the table\n delete_table(table_name)\n \n #print the end time\n print_time()\nif __name__ == \"__main__\":\n # Number of threads to use for concurrent inserts\n num_threads = 157\n \n # Duration for which to run the inserts (in seconds)\n insert_duration = None\n \n # Number of columns in the table\n num_columns = 39\n \n # Number of rows to insert\n num_rows = 97\n \n # Size of each column (in characters)\n column_size = 52\n \n # Table name\n table_name = 'table1'\n \n # Call the insert_large_data function\n insert_large_data(num_threads, insert_duration, num_columns, num_rows, column_size, table_name)\n", + "description": "In a manufacturing database, 157 machines simultaneously generate a large amount of data that needs to be inserted into the database. Each machine generates data for 39 columns, with each column size being 52 characters, and a total of 97 rows of data. This process may cause an exception in the database due to the high volume of data being inserted simultaneously.\n", + "desc": "In an internet of things (IoT) scenario, there is a database specifically used for collecting and analyzing sensor data, named 'SensorDataDB'. This database is designed to handle a large volume of data from various types of sensors. The primary table in the database is called 'SensorReadings', which contains various fields to store data from multiple sensors. 
These fields may include sensor ID, reading type (such as temperature, humidity, pressure, light, motion, etc.), reading value, timestamp, sensor location, and status information.\n" + }, + "386": { + "start_time": "1697353147", + "end_time": "1697353208", + "start_timestamp": "2023-10-15 14:59:07", + "end_timestamp": "2023-10-15 15:00:08", + "alerts": [], + "labels": [ + "highly concurrent updates" + ], + "command": "python anomaly_trigger/main.py --anomaly LOCK_CONTENTION", + "script": "import psycopg2\nimport sys\nsys.path.append('/root/DB-GPT/')\nimport time\nimport datetime\nimport random\nimport yaml\nfrom multiprocessing.pool import *\n\n\nclass DBArgs(object):\n\n def __init__(self, dbtype, config, dbname=None):\n self.dbtype = dbtype\n if self.dbtype == 'mysql':\n self.host = config['host']\n self.port = config['port']\n self.user = config['user']\n self.password = config['password']\n self.dbname = dbname if dbname else config['dbname']\n self.driver = 'com.mysql.jdbc.Driver'\n self.jdbc = 'jdbc:mysql://'\n else:\n self.host = config['host']\n self.port = config['port']\n self.user = config['user']\n self.password = config['password']\n self.dbname = dbname if dbname else config['dbname']\n self.driver = 'org.postgresql.Driver'\n self.jdbc = 'jdbc:postgresql://'\n\nclass Database():\n def __init__(self, args, timeout=-1):\n self.args = args\n self.conn = self.resetConn(timeout)\n\n\n # self.schema = self.compute_table_schema()\n\n def resetConn(self, timeout=-1):\n conn = psycopg2.connect(database=self.args.dbname,\n user=self.args.user,\n password=self.args.password,\n host=self.args.host,\n port=self.args.port)\n return conn\n \n def execute_sqls(self,sql):\n self.conn =self.resetConn(timeout=-1)\n cur = self.conn.cursor()\n cur.execute(sql)\n self.conn.commit()\n cur.close()\n self.conn.close()\n\n def execute_sql_duration(self, duration, sql, max_id=0, commit_interval=500):\n self.conn = self.resetConn(timeout=-1)\n cursor = self.conn.cursor()\n start = time.time()\n cnt = 0\n if duration > 0:\n while (time.time() - start) < duration:\n if max_id > 0:\n id = random.randint(1, max_id - 1)\n cursor.execute(sql + str(id) + ';')\n else:\n cursor.execute(sql)\n cnt += 1\n if cnt % commit_interval == 0:\n self.conn.commit()\n else:\n print(\"error, the duration should be larger than 0\")\n self.conn.commit()\n cursor.close()\n self.conn.close()\n return cnt\n\n def concurrent_execute_sql(self, threads, duration, sql, max_id=0, commit_interval=500):\n pool = ThreadPool(threads)\n results = [pool.apply_async(self.execute_sql_duration, (duration, sql, max_id, commit_interval)) for _ in range(threads)]\n pool.close()\n pool.join()\n return results\n\ndef init():\n #add the config\n config_path = \"/root/DB-GPT/config/tool_config.yaml\"\n with open(config_path, 'r') as config_file:\n config = yaml.safe_load(config_file) \n db_args =DBArgs('pgsql', config)\n return db_args\n\n\n#create a table\ndef create_table(table_name,colsize, ncolumns):\n db=Database(init())\n column_definitions = ', '.join(f'name{i} varchar({colsize})' for i in range(ncolumns))\n creat_sql = f'CREATE TABLE {table_name} (id int, {column_definitions}, time timestamp);'\n db.execute_sqls(creat_sql)\n\n#delete the table\ndef delete_table(table_name):\n db=Database(init())\n delete_sql=f'DROP TABLE if exists {table_name}'\n db.execute_sqls(delete_sql)\n\n#print the current time\ndef print_time():\n current_time = datetime.datetime.now()\n formatted_time = current_time.strftime(\"%Y-%m-%d %H:%M:%S\")\n 
print(formatted_time)\n\n\ndef lock_contention(threads,duration,ncolumns,nrows,colsize,table_name='table1'):\n #create a new table\n print_time()\n delete_table(table_name)\n create_table(table_name,colsize, ncolumns)\n db=Database(init())\n # insert some data to be updated \n insert_definitions = ', '.join(f'(SELECT substr(md5(random()::text), 1, {colsize}))' for i in range(ncolumns))\n insert_data=f'insert into {table_name} select generate_series(1,{nrows}),{insert_definitions}, now();' \n db.execute_sqls(insert_data) \n pool = Pool(threads)\n for _ in range(threads):\n pool.apply_async(\n lock, (table_name, ncolumns, colsize, duration, nrows))\n pool.close()\n pool.join()\n #delete the table\n delete_table(table_name)\n print_time()\n\ndef lock(table_name, ncolumns, colsize, duration, nrows):\n args=init()\n start = time.time()\n #lock_contention\n while time.time()-start < duration:\n conn = psycopg2.connect(database=args.dbname, user=args.user, password=args.password,\n host=args.host, port=args.port)\n cur = conn.cursor()\n while time.time()-start < duration:\n col_name = random.randint(0, ncolumns-1)\n row_name = random.randint(1, nrows-1)\n lock_contention = f'update {table_name} set name{col_name}=(SELECT substr(md5(random()::text), 1, {colsize})) where id ={row_name}'\n #db.concurrent_execute_sql(threads,duration,lock_contention,nrows)\n cur.execute(lock_contention)\n conn.commit()\n conn.commit()\n conn.close()\n\nif __name__ == \"__main__\":\n # Number of threads to use for concurrent inserts\n num_threads = 89\n \n # Duration for which to run the inserts (in seconds)\n insert_duration = None\n \n # Number of columns in the table\n num_columns = 78\n \n # Number of rows to insert\n num_rows = 319\n \n # Size of each column (in characters)\n column_size = 99\n \n # Table name\n table_name = 'table1'\n \n # Call the insert_large_data function\n lock_contention(num_threads, insert_duration, num_columns, num_rows, column_size, table_name)\n", + "description": "In the database of an online store, 89 users simultaneously attempt to perform a frequent update operation in a database table containing 78 columns and 319 rows of product records, each with a column size of 99 characters. These users compete with each other to lock the database table to perform the update operation.\n", + "desc": "At a specific time, 157 sensors start transmitting data simultaneously to the database. Each sensor produces data with 39 different columns, and each column can contain data of up to 52 characters. These columns may include sensor ID, sensor type, sensor value, timestamp, location, and other relevant information. However, due to the large volume of data being inserted at once and potentially inadequate data insertion mechanisms, such as buffering or batching, the database might not be able to efficiently handle this influx of data. 
This could result in latency in the data insertion operation and may also lead to anomalies in the database.\n" + }, + "387": { + "start_time": "1697353268", + "end_time": "1697353353", + "start_timestamp": "2023-10-15 15:01:08", + "end_timestamp": "2023-10-15 15:02:33", + "alerts": [], + "labels": [ + "highly deletes" + ], + "command": "python anomaly_trigger/main.py --anomaly VACUUM", + "script": "import psycopg2\nimport sys\nsys.path.append('/root/DB-GPT/')\nimport time\nimport datetime\nimport random\nimport yaml\nfrom multiprocessing.pool import *\n\n\nclass DBArgs(object):\n\n def __init__(self, dbtype, config, dbname=None):\n self.dbtype = dbtype\n if self.dbtype == 'mysql':\n self.host = config['host']\n self.port = config['port']\n self.user = config['user']\n self.password = config['password']\n self.dbname = dbname if dbname else config['dbname']\n self.driver = 'com.mysql.jdbc.Driver'\n self.jdbc = 'jdbc:mysql://'\n else:\n self.host = config['host']\n self.port = config['port']\n self.user = config['user']\n self.password = config['password']\n self.dbname = dbname if dbname else config['dbname']\n self.driver = 'org.postgresql.Driver'\n self.jdbc = 'jdbc:postgresql://'\n\nclass Database():\n def __init__(self, args, timeout=-1):\n self.args = args\n self.conn = self.resetConn(timeout)\n\n\n # self.schema = self.compute_table_schema()\n\n def resetConn(self, timeout=-1):\n conn = psycopg2.connect(database=self.args.dbname,\n user=self.args.user,\n password=self.args.password,\n host=self.args.host,\n port=self.args.port)\n return conn\n \n def execute_sqls(self,sql):\n self.conn =self.resetConn(timeout=-1)\n cur = self.conn.cursor()\n cur.execute(sql)\n self.conn.commit()\n cur.close()\n self.conn.close()\n\n def execute_sql_duration(self, duration, sql, max_id=0, commit_interval=500):\n self.conn = self.resetConn(timeout=-1)\n cursor = self.conn.cursor()\n start = time.time()\n cnt = 0\n if duration > 0:\n while (time.time() - start) < duration:\n if max_id > 0:\n id = random.randint(1, max_id - 1)\n cursor.execute(sql + str(id) + ';')\n else:\n cursor.execute(sql)\n cnt += 1\n if cnt % commit_interval == 0:\n self.conn.commit()\n else:\n print(\"error, the duration should be larger than 0\")\n self.conn.commit()\n cursor.close()\n self.conn.close()\n return cnt\n\n def concurrent_execute_sql(self, threads, duration, sql, max_id=0, commit_interval=500):\n pool = ThreadPool(threads)\n results = [pool.apply_async(self.execute_sql_duration, (duration, sql, max_id, commit_interval)) for _ in range(threads)]\n pool.close()\n pool.join()\n return results\n \ndef init():\n #add the config\n config_path = \"/root/DB-GPT/config/tool_config.yaml\"\n with open(config_path, 'r') as config_file:\n config = yaml.safe_load(config_file) \n db_args =DBArgs('pgsql', config)\n return db_args\n\n\n#create a table\ndef create_table(table_name,colsize, ncolumns):\n db=Database(init())\n column_definitions = ', '.join(f'name{i} varchar({colsize})' for i in range(ncolumns))\n creat_sql = f'CREATE TABLE {table_name} (id int, {column_definitions}, time timestamp);'\n db.execute_sqls(creat_sql)\n\n#delete the table\ndef delete_table(table_name):\n db=Database(init())\n delete_sql=f'DROP TABLE if exists {table_name}'\n db.execute_sqls(delete_sql)\n\n#print the current time\ndef print_time():\n current_time = datetime.datetime.now()\n formatted_time = current_time.strftime(\"%Y-%m-%d %H:%M:%S\")\n print(formatted_time)\n\n\ndef vacuum(threads,duration,ncolumns,nrows,colsize,table_name='table1'):\n 
db=Database(init())\n #create a new table\n print_time()\n delete_table(table_name)\n create_table(table_name,colsize, ncolumns)\n\n # insert some data to be deleted\n insert_definitions = ', '.join(f'(SELECT substr(md5(random()::text), 1, {colsize}))' for i in range(ncolumns))\n insert_data=f'insert into {table_name} select generate_series(1,{nrows}),{insert_definitions}, now();' \n db.execute_sqls(insert_data) \n\n # delete 80% of the rows\n delete_nrows=int(nrows*0.8)\n vacuum=f'delete from {table_name} where id < {delete_nrows};'\n db.execute_sqls(vacuum)\n\n # do the select , then the vacuum occurs\n select='select * from '+table_name+' where id='\n db.concurrent_execute_sql(threads,duration,select,nrows)\n\n #delete the table\n delete_table(table_name)\n print_time()\n\n\nif __name__ == \"__main__\":\n # Number of threads to use for concurrent inserts\n num_threads = 153\n \n # Duration for which to run the inserts (in seconds)\n insert_duration = None\n \n # Number of columns in the table\n num_columns = 10\n \n # Number of rows to insert\n num_rows = 3276695\n \n # Size of each column (in characters)\n column_size = 52\n \n # Table name\n table_name = 'table1'\n \n # Call the insert_large_data function\n vacuum(num_threads, insert_duration, num_columns, num_rows, column_size, table_name)\n", + "description": "In a database used by a popular social media platform, when 153 users simultaneously perform a search after a data cleaning operation on a database table with 10 columns and 3,276,695 rows, each with a column size of 52 characters, an exception is generated due to high search load and insufficient optimization.\n", + "desc": "In a banking scenario, there is a database called 'BankingDB' that stores customer and transaction data for a bank. The database contains a key table named 'AccountTransactions' that records detailed information about various banking transactions. This table consists of 319 rows of data, each representing a transaction record for an account, with 78 columns, each containing information of up to 99 characters. These columns may include transaction ID, account number, transaction type (such as deposit, withdrawal, transfer), transaction amount, transaction date and time, counterparty account information, transaction status, employee ID, transaction location, currency type, and more. In this particular scenario, 89 users are simultaneously attempting to access and update the 'AccountTransactions' table. Due to the high concurrency and simultaneous access, there is contention for locking the database table, causing performance issues. This may result in delayed processing or failure of transaction requests from the users, impacting the daily operations of the bank. Additionally, if such incidents occur frequently, it could lead to database transaction log growth and temporary interruptions in database services.Assume we have a database in the e-commerce domain called 'ECommerceDB', which is responsible for storing and managing product data. Within this database, there is a table named 'ProductDetails' that contains important information about various products. This table consists of a large number of rows (specifically, 3,276,695 rows) with a total of 10 columns. Each column can hold up to 52 characters and contains data such as product ID, name, price, quantity, brand, category, size, color, weight, and description.In this particular scenario, a vacuum operation needs to be performed on the 'ProductDetails' table. 
Vacuuming is a process commonly used to reclaim unused space and optimize database performance by reorganizing data storage. This operation involves scanning and rewriting the table data to remove deleted or obsolete rows, rearranging the remaining data, and reclaiming unused storage space.To execute this vacuum operation efficiently, 153 threads will be utilized for parallel processing. Each thread is responsible for handling a portion of the data, ensuring that the process is completed in a timely manner. This parallel processing approach helps to minimize the overall execution time and optimize resource utilization.By performing the vacuum operation, the database can eliminate obsolete data, free up storage space, and improve query performance and overall database efficiency. This helps to maintain the smooth operation of the e-commerce platform by ensuring that the database is in a clean and optimized state.\n" + }, + "388": { + "start_time": "1697353413", + "end_time": "1697353528", + "start_timestamp": "2023-10-15 15:03:33", + "end_timestamp": "2023-10-15 15:05:28", + "alerts": [], + "labels": [ + "too many indexes" + ], + "command": "python anomaly_trigger/main.py --anomaly REDUNDANT_INDEX", + "script": "import psycopg2\nimport sys\nsys.path.append('/root/DB-GPT/')\nimport time\nimport datetime\nimport random\nimport yaml\nfrom multiprocessing.pool import *\n\n\nclass DBArgs(object):\n\n def __init__(self, dbtype, config, dbname=None):\n self.dbtype = dbtype\n if self.dbtype == 'mysql':\n self.host = config['host']\n self.port = config['port']\n self.user = config['user']\n self.password = config['password']\n self.dbname = dbname if dbname else config['dbname']\n self.driver = 'com.mysql.jdbc.Driver'\n self.jdbc = 'jdbc:mysql://'\n else:\n self.host = config['host']\n self.port = config['port']\n self.user = config['user']\n self.password = config['password']\n self.dbname = dbname if dbname else config['dbname']\n self.driver = 'org.postgresql.Driver'\n self.jdbc = 'jdbc:postgresql://'\n\nclass Database():\n def __init__(self, args, timeout=-1):\n self.args = args\n self.conn = self.resetConn(timeout)\n\n\n # self.schema = self.compute_table_schema()\n\n def resetConn(self, timeout=-1):\n conn = psycopg2.connect(database=self.args.dbname,\n user=self.args.user,\n password=self.args.password,\n host=self.args.host,\n port=self.args.port)\n return conn\n \n def execute_sqls(self,sql):\n self.conn =self.resetConn(timeout=-1)\n cur = self.conn.cursor()\n cur.execute(sql)\n self.conn.commit()\n cur.close()\n self.conn.close()\n\n def execute_sql_duration(self, duration, sql, max_id=0, commit_interval=500):\n self.conn = self.resetConn(timeout=-1)\n cursor = self.conn.cursor()\n start = time.time()\n cnt = 0\n if duration > 0:\n while (time.time() - start) < duration:\n if max_id > 0:\n id = random.randint(1, max_id - 1)\n cursor.execute(sql + str(id) + ';')\n else:\n cursor.execute(sql)\n cnt += 1\n if cnt % commit_interval == 0:\n self.conn.commit()\n else:\n print(\"error, the duration should be larger than 0\")\n self.conn.commit()\n cursor.close()\n self.conn.close()\n return cnt\n\n def concurrent_execute_sql(self, threads, duration, sql, max_id=0, commit_interval=500):\n pool = ThreadPool(threads)\n results = [pool.apply_async(self.execute_sql_duration, (duration, sql, max_id, commit_interval)) for _ in range(threads)]\n pool.close()\n pool.join()\n return results\n \n def build_index(self, table_name, idx_num):\n self.conn = self.resetConn(timeout=-1)\n cursor = 
self.conn.cursor()\n \n for i in range(0, idx_num):\n the_sql = 'CREATE INDEX index_' + table_name + '_' + str(i) + ' ON ' + table_name + '(name' + str(i) + ');'\n print(the_sql)\n cursor.execute(the_sql)\n\n \n self.conn.commit()\n self.conn.close()\n return\n\n\n \n def drop_index(self,table_name):\n self.conn = self.resetConn(timeout=-1)\n cursor = self.conn.cursor()\n cursor.execute(\"select indexname from pg_indexes where tablename='\"+table_name+\"';\")\n idxs = cursor.fetchall()\n for idx in idxs:\n the_sql = 'DROP INDEX ' + idx[0] + ';'\n cursor.execute(the_sql)\n print(the_sql)\n self.conn.commit()\n self.conn.close()\n return\n\n\ndef init():\n #add the config\n config_path = \"/root/DB-GPT/config/tool_config.yaml\"\n with open(config_path, 'r') as config_file:\n config = yaml.safe_load(config_file) \n db_args =DBArgs('pgsql', config)\n return db_args\n\n\n#create a table\ndef create_table(table_name,colsize, ncolumns):\n db=Database(init())\n column_definitions = ', '.join(f'name{i} varchar({colsize})' for i in range(ncolumns))\n creat_sql = f'CREATE TABLE {table_name} (id int, {column_definitions}, time timestamp);'\n db.execute_sqls(creat_sql)\n\n#delete the table\ndef delete_table(table_name):\n db=Database(init())\n delete_sql=f'DROP TABLE if exists {table_name}'\n db.execute_sqls(delete_sql)\n\n#print the current time\ndef print_time():\n current_time = datetime.datetime.now()\n formatted_time = current_time.strftime(\"%Y-%m-%d %H:%M:%S\")\n print(formatted_time)\n\ndef redundent_index(threads,duration,ncolumns,nrows,colsize,nindex,table_name='table1'):\n #create a new table\n print_time()\n delete_table(table_name)\n create_table(table_name,colsize, ncolumns)\n db=Database(init())\n # insert some data to be updated \n insert_definitions = ', '.join(f'(SELECT substr(md5(random()::text), 1, {colsize}))' for i in range(ncolumns))\n insert_data=f'insert into {table_name} select generate_series(1,{nrows}),{insert_definitions}, now();' \n db.execute_sqls(insert_data) \n\n #initialization of the indexes\n nindex=int((nindex*ncolumns)/10)\n db.build_index(table_name,nindex)\n id_index='CREATE INDEX index_'+table_name+'_id ON '+table_name+'(id);'\n db.execute_sqls(id_index)\n\n #lock_contention\n pool = Pool(threads)\n for _ in range(threads):\n pool.apply_async(\n lock, (table_name, ncolumns, colsize, duration, nrows))\n pool.close()\n pool.join()\n\n #drop the index\n db.drop_index(table_name)\n\n #delete the table\n delete_table(table_name)\n print_time()\n\n\ndef lock(table_name, ncolumns, colsize, duration, nrows):\n args=init()\n start = time.time()\n #lock_contention\n while time.time()-start < duration:\n conn = psycopg2.connect(database=args.dbname, user=args.user, password=args.password,\n host=args.host, port=args.port)\n cur = conn.cursor()\n while time.time()-start < duration:\n col_name = random.randint(0, ncolumns-1)\n row_name = random.randint(1, nrows-1)\n lock_contention = f'update {table_name} set name{col_name}=(SELECT substr(md5(random()::text), 1, {colsize})) where id ={row_name}'\n #db.concurrent_execute_sql(threads,duration,lock_contention,nrows)\n cur.execute(lock_contention)\n conn.commit()\n conn.commit()\n conn.close()\n\nif __name__ == \"__main__\":\n # Number of threads to use for concurrent inserts\n num_threads = 9\n \n # Duration for which to run the inserts (in seconds)\n insert_duration = None\n \n # Number of columns in the table\n num_columns = 87\n \n # Number of rows to insert\n num_rows = 642046\n \n # Size of each column (in characters)\n 
column_size = 63\n \n # Table name\n table_name = 'table1'\n \n nindex=6\n \n # Call the insert_large_data function\n redundent_index(num_threads, insert_duration, num_columns, num_rows, column_size, nindex,table_name)\n", + "description": "In an online marketplace database, 9 users are performing a query operation on a table containing 87 columns and 642,046 rows of product records, each column with a size of 63 characters. Redundant indexes are created before the query operation, which may result in additional storage usage and performance overhead.\n", + "desc": "In an IoT scenario, imagine a database called 'SensorDataDB', used specifically for collecting and analyzing data from various types of sensors. This database records information from a large number of sensors and utilizes a table called 'SensorReadings' to store this data. The 'SensorReadings' table contains 642,046 rows of data, with each row representing a reading from a sensor. This table consists of 87 columns, each with a maximum size of 63 characters. These columns may include sensor ID, reading type, reading value, timestamp, location, and other relevant information. In this particular scenario, the database encounters redundancy in index creation. This redundancy arises from the need to accelerate queries for sensor data analysis, such as trend analysis or anomaly detection. Due to the frequent creation of indexes before queries and their subsequent deletion, the database experiences increased storage usage and performance overhead. This can lead to decreased efficiency in generating analysis reports, potentially affecting the overall effectiveness of the IoT system.\n" + }, + "389": { + "start_time": "1697353588", + "end_time": "1697353678", + "start_timestamp": "2023-10-15 15:06:28", + "end_timestamp": "2023-10-15 15:07:58", + "alerts": [], + "labels": [ + "INSERT_LARGE_DATA", + "IO_CONTENTION" + ], + "command": "python anomaly_trigger/main.py --anomaly INSERT_LARGE_DATA,IO_CONTENTION", + "script": "import os\nimport datetime\n\n#print the current time\ndef print_time():\n current_time = datetime.datetime.now()\n formatted_time = current_time.strftime(\"%Y-%m-%d %H:%M:%S\")\n print(formatted_time)\n\nif __name__ == \"__main__\":\n print_time()\n command = (\n \"su - root -c 'cd /sysbench-tpcc-master; \"\n \"./tpcc.lua --db-driver=pgsql --tables=2 --scale=3 --threads=50 --events=0 \"\n \"--pgsql-host=xxxx --pgsql-user=xxxx --pgsql-password=xxxx \"\n \"--pgsql-port=5432 --pgsql-db=tpcc --time=90 --rand-type=uniform --report-interval=10 run'\"\n )\n\n os.system(command)\n print_time()\n", + "description": "In a busy file sharing system, multiple users are simultaneously uploading, downloading, or editing files, causing a competition for input/output (I/O) operations. This leads to slower file transfers as the file system struggles to handle the increased workload.\n", + "desc": "In an online file sharing system used by teams or organizations, there is a database called 'TeamFileShareDB' that stores both the files themselves and their metadata, such as uploader information, file size, creation and modification dates, version history, access permissions, and download counts. This system experiences a high level of concurrency, with multiple users simultaneously uploading, downloading, and editing files throughout the day. This includes large files such as presentations, video conference recordings, and design drawings. 
Due to the high volume of file operations, the system encounters challenges related to input/output (I/O) contention. When multiple users are uploading or downloading large files at the same time, the system's storage and network bandwidth become strained. This leads to slower file transfer speeds, particularly when there is limited bandwidth or insufficient server processing capabilities. Additionally, frequent write operations in the database, such as file uploads and updates to file metadata, can impact the overall performance of the database. During peak periods, the database may experience locking and transaction management issues, further slowing down file processing and the recording of metadata.\n" + }, + "390": { + "start_time": "1697353738", + "end_time": "1697353799", + "start_timestamp": "2023-10-15 15:08:58", + "end_timestamp": "2023-10-15 15:09:59", + "alerts": [], + "labels": [ + "POOR JOIN PERFORMANCE", + "CPU CONTENTION" + ], + "command": "python anomaly_trigger/main.py --anomaly POOR_JOIN_PERFORMANCE,CPU_CONTENTION", + "script": "import os\nimport re\nimport time\n\nimport psycopg2\n\n\nREPEATCOUNT = 1\nTIMELOGPATH = str(int(time.time())) + \"_job_trigger_time_log.txt\"\nTIMELOG = open(TIMELOGPATH, 'w+')\n\n\nclass Database():\n\n def __init__(self):\n self.conn = None\n self.conn = psycopg2.connect(database='imdbload',\n user='xxxx',\n password='xxxx',\n host='xxxx',\n port=5432)\n\n def execute_sql(self, sql):\n fail = 1\n cur = self.conn.cursor()\n i = 0\n cnt = 3\n while fail == 1 and i < cnt:\n try:\n fail = 0\n cur.execute(sql)\n except BaseException as error:\n fail = 1\n print(error)\n res = []\n if fail == 0:\n res = cur.fetchall()\n i = i + 1\n if fail == 1:\n # print(\"SQL Execution Fatal!!\", sql)\n return 0, ''\n elif fail == 0:\n return 1, res\n\n\ndef all_sql_files():\n res_path = \"{}/join-order-benchmark-master/\".format(\n os.path.dirname(os.path.abspath(__file__)))\n # all_file_list = list(filter(file_filter, os.listdir(res_path)))\n # all_file_list = sorted(all_file_list, key=custom_sort)\n all_file_list = [\n '1a.sql', '1b.sql', '1c.sql', '1d.sql',\n '2a.sql', '2b.sql', '2c.sql', '2d.sql',\n '3a.sql', '3b.sql', '3c.sql',\n '4a.sql', '4b.sql', '4c.sql',\n '5a.sql', '5b.sql', '5c.sql',\n '6a.sql', '6b.sql', '6c.sql', '6d.sql', '6e.sql', '6f.sql',\n '7a.sql', '7b.sql', '7c.sql',\n '8a.sql', '8b.sql', '8c.sql', '8d.sql',\n '9a.sql', '9b.sql', '9c.sql', '9d.sql',\n '10a.sql', '10b.sql', '10c.sql',\n '11a.sql', '11b.sql', '11c.sql', '11d.sql',\n '12a.sql', '12b.sql', '12c.sql',\n '13a.sql', '13b.sql', '13c.sql', '13d.sql',\n '14a.sql', '14b.sql', '14c.sql',\n '15a.sql', '15b.sql', '15c.sql', '15d.sql',\n '16a.sql', '16b.sql', '16c.sql', '16d.sql',\n '17a.sql', '17b.sql', '17c.sql', '17d.sql', '17e.sql', '17f.sql',\n '18a.sql', '18b.sql', '18c.sql',\n '19a.sql', '19b.sql', '19c.sql', '19d.sql',\n '20a.sql', '20b.sql', '20c.sql',\n '21a.sql', '21b.sql', '21c.sql',\n '22a.sql', '22b.sql', '22c.sql', '22d.sql',\n '23a.sql', '23b.sql', '23c.sql',\n '24a.sql', '24b.sql',\n '25a.sql', '25b.sql', '25c.sql',\n '26a.sql', '26b.sql', '26c.sql',\n '27a.sql', '27b.sql', '27c.sql',\n '28a.sql', '28b.sql', '28c.sql',\n '29a.sql', '29b.sql', '29c.sql',\n '30a.sql', '30b.sql', '30c.sql',\n '31a.sql', '31b.sql', '31c.sql',\n '32a.sql', '32b.sql',\n '33a.sql', '33b.sql', '33c.sql']\n\n print(all_file_list)\n files_list = []\n for file in all_file_list:\n files_list.append(res_path + file)\n return files_list\n\n\ndef custom_sort(item):\n # 
\u63d0\u53d6\u6570\u5b57\u548c\u5b57\u6bcd\u90e8\u5206\n match = re.match(r'(\\d+)(\\D+)', item)\n # \u5c06\u6570\u5b57\u90e8\u5206\u8f6c\u6362\u4e3a\u6574\u6570\u4ee5\u8fdb\u884c\u6bd4\u8f83\n num_part = int(match.group(1))\n # \u8fd4\u56de\u5143\u7ec4\u4ee5\u6309\u6570\u5b57\u548c\u5b57\u6bcd\u6392\u5e8f\n return (num_part, match.group(2))\n\n\ndef file_filter(f):\n if f[-4:] == '.sql' and 'schema' not in f and 'fkindexes' not in f:\n return True\n else:\n return False\n\n\ndef get_sql_from_file(file_name):\n file = open(file_name)\n lines = file.readlines().copy()\n sql = ''\n for line in lines:\n sql += line\n sql = sql.replace('\n', ' ').replace(' ', ' ').replace(' ', ' ')\n file.close()\n return sql\n\n\ndef test_hint_from_file(sql_file):\n db = Database()\n sql = get_sql_from_file(sql_file)\n success, result_cont = db.execute_sql(sql)\n print(success, result_cont)\n\n\ndef test_all():\n sql_files = all_sql_files()\n\n for sql_file in list(sql_files)[:-10]:\n if sql_file:\n test_hint_from_file(sql_file)\n\n\ndef test_one():\n res_path = \"{}/join-order-benchmark-master/\".format(\n os.path.dirname(os.path.abspath(__file__)))\n test_hint_from_file(res_path + '1a.sql')\n\n\nif __name__ == '__main__':\n for i in range(0, REPEATCOUNT):\n TIMELOG.write(str(int(time.time()))+\";\")\n test_all()\n TIMELOG.write(str(int(time.time()))+\"\n\")\n TIMELOG.flush()\n\n TIMELOG.close()\n\n)\n", + "description": "In a computer system where multiple tasks are running simultaneously, there is a performance issue with joining tables in a database query. This is causing the CPU to become overloaded, leading to contention and slowing down the overall system performance.\n", + "desc": "In a business intelligence scenario, we can imagine a database named 'CorporateAnalyticsDB' that stores and analyzes various business data of a large corporation. This database contains multiple complex tables such as 'SalesData', 'CustomerProfiles', 'ProductCatalog', and 'MarketTrends', each filled with a large number of rows and columns, recording detailed business information. In this scenario, there is a need for complex join queries across multiple tables to generate comprehensive business reports. These queries involve joining tables like 'SalesData' with 'CustomerProfiles', 'ProductCatalog', and 'MarketTrends' to analyze purchasing behaviors and gain market insights. However, due to the large size of the tables, the lack of effective indexing on join keys, or poor query optimization, executing these join queries can be slow. Additionally, during peak periods when multiple complex join queries are executed simultaneously, there may be competition for CPU resources, further impacting query efficiency. 
This CPU contention could occur when there are too many compute-intensive queries running on the database server or if the server's CPU resources are insufficient.\n" + }, + "391": { + "start_time": "1697353859", + "end_time": "1697354008", + "start_timestamp": "2023-10-15 15:10:59", + "end_timestamp": "2023-10-15 15:13:28", + "alerts": [], + "labels": [ + "FETCH_LARGE_DATA", + "CORRELATED SUBQUERY" + ], + "command": "python anomaly_trigger/main.py --anomaly FETCH_LARGE_DATA,CORRELATED_SUBQUERY", + "script": "import os\nimport re\nimport time\n\nimport psycopg2\n\n\nREPEATCOUNT = 1\nTIMELOGPATH = str(int(time.time())) + \"_tpch_trigger_time_log.txt\"\nTIMELOG = open(TIMELOGPATH, 'w+')\n\n\nclass Database():\n\n def __init__(self):\n self.conn = None\n self.conn = psycopg2.connect(database='tpch',\n user='xxxx',\n password='xxxx',\n host='xxxx',\n port=5432)\n\n def execute_sql(self, sql):\n fail = 1\n cur = self.conn.cursor()\n i = 0\n cnt = 3\n while fail == 1 and i < cnt:\n try:\n fail = 0\n cur.execute(sql)\n except BaseException as error:\n fail = 1\n print(error)\n res = []\n if fail == 0:\n res = cur.fetchall()\n i = i + 1\n if fail == 1:\n # print(\"SQL Execution Fatal!!\", sql)\n return 0, ''\n elif fail == 0:\n return 1, res\n\n\ndef all_sql_files():\n res_path = \"{}/tpch-queries/\".format(\n os.path.dirname(os.path.abspath(__file__)))\n # all_file_list = list(filter(file_filter, os.listdir(res_path)))\n # all_file_list = sorted(all_file_list, key=custom_sort)\n all_file_list = [\n '4.explain.sql']\n\n print(all_file_list)\n files_list = []\n for file in all_file_list:\n files_list.append(res_path + file)\n return files_list\n\n\ndef custom_sort(item):\n # \u63d0\u53d6\u6570\u5b57\u548c\u5b57\u6bcd\u90e8\u5206\n match = re.match(r'(\\d+)(\\D+)', item)\n # \u5c06\u6570\u5b57\u90e8\u5206\u8f6c\u6362\u4e3a\u6574\u6570\u4ee5\u8fdb\u884c\u6bd4\u8f83\n num_part = int(match.group(1))\n # \u8fd4\u56de\u5143\u7ec4\u4ee5\u6309\u6570\u5b57\u548c\u5b57\u6bcd\u6392\u5e8f\n return (num_part, match.group(2))\n\n\ndef file_filter(f):\n if f[-4:] == '.sql' and 'schema' not in f and 'fkindexes' not in f:\n return True\n else:\n return False\n\n\ndef get_sql_from_file(file_name):\n file = open(file_name)\n lines = file.readlines().copy()\n sql = ''\n for line in lines:\n sql += line\n sql = sql.replace('\n', ' ').replace(' ', ' ').replace(' ', ' ')\n file.close()\n return sql\n\n\ndef test_hint_from_file(sql_file):\n db = Database()\n sql = get_sql_from_file(sql_file)\n success, result_cont = db.execute_sql(sql)\n print(success, result_cont)\n\n\ndef test_all():\n sql_files = all_sql_files()\n\n for sql_file in list(sql_files):\n if sql_file:\n test_hint_from_file(sql_file)\n\n\ndef test_one():\n res_path = \"{}/tpch-queries/\".format(\n os.path.dirname(os.path.abspath(__file__)))\n test_hint_from_file(res_path + '1.explain.sql')\n\n\nif __name__ == '__main__':\n for i in range(0, REPEATCOUNT):\n TIMELOG.write(str(int(time.time()))+\";\")\n test_all()\n TIMELOG.write(str(int(time.time()))+\"\n\")\n TIMELOG.flush()\n\n TIMELOG.close()\n", + "description": "In an online retail platform, when trying to retrieve a large amount of data related to inventory, such as the quantity of each product available, the database may encounter performance issues if the query involves correlated subqueries that are not optimized.\n", + "desc": "In an Internet of Things (IoT) scenario, imagine a database called 'SmartHomeDB', which is used to store and analyze data from various smart home devices. 
This database contains a table named 'DeviceData', which records the data generated by different devices in a smart home system. The table consists of a large number of rows and columns, representing different types of devices (e.g., thermostats, security cameras, motion sensors, etc.) and their corresponding data (e.g., temperature readings, video recordings, motion detection events, etc.). In this scenario, the database is used to monitor and analyze the data generated by these smart home devices. For example, a user might want to fetch data from the 'DeviceData' table to understand the energy consumption patterns of different devices in their home. This could involve performing complex queries that require joining multiple tables and performing correlated subqueries, such as retrieving the electricity usage data of all devices within a specific time range and comparing it with historical data.However, due to the large volume of data and the complexity of these queries, fetching large amounts of data and executing correlated subqueries can be time-consuming and resource-intensive. This can result in slow query performance and increased processing time. In such cases, the database might need to read a significant amount of data from the disk, leading to I/O bottlenecks. Additionally, if the join keys in the tables are not effectively indexed or if the queries are not properly optimized, the performance of these queries can be further impacted, making it even more challenging to fetch large amounts of data and perform correlated subqueries efficiently.\n" + }, + "392": { + "start_time": "1697354069", + "end_time": "1697354141", + "start_timestamp": "2023-10-15 15:14:29", + "end_timestamp": "2023-10-15 15:15:41", + "alerts": [], + "labels": [ + "highly concurrent commits or highly concurrent inserts" + ], + "command": "python anomaly_trigger/main.py --anomaly INSERT_LARGE_DATA", + "script": "import psycopg2\nimport sys\nsys.path.append('/root/DB-GPT/')\nimport time\nimport datetime\nimport random\nimport yaml\nfrom multiprocessing.pool import *\n\nclass DBArgs(object):\n\n def __init__(self, dbtype, config, dbname=None):\n self.dbtype = dbtype\n if self.dbtype == 'mysql':\n self.host = config['host']\n self.port = config['port']\n self.user = config['user']\n self.password = config['password']\n self.dbname = dbname if dbname else config['dbname']\n self.driver = 'com.mysql.jdbc.Driver'\n self.jdbc = 'jdbc:mysql://'\n else:\n self.host = config['host']\n self.port = config['port']\n self.user = config['user']\n self.password = config['password']\n self.dbname = dbname if dbname else config['dbname']\n self.driver = 'org.postgresql.Driver'\n self.jdbc = 'jdbc:postgresql://'\n\nclass Database():\n def __init__(self, args, timeout=-1):\n self.args = args\n self.conn = self.resetConn(timeout)\n\n\n # self.schema = self.compute_table_schema()\n\n def resetConn(self, timeout=-1):\n conn = psycopg2.connect(database=self.args.dbname,\n user=self.args.user,\n password=self.args.password,\n host=self.args.host,\n port=self.args.port)\n return conn\n\n def execute_sqls(self,sql):\n self.conn =self.resetConn(timeout=-1)\n cur = self.conn.cursor()\n cur.execute(sql)\n self.conn.commit()\n cur.close()\n self.conn.close()\n\n def execute_sql_duration(self, duration, sql, max_id=0, commit_interval=500):\n self.conn = self.resetConn(timeout=-1)\n cursor = self.conn.cursor()\n start = time.time()\n cnt = 0\n if duration > 0:\n while (time.time() - start) < duration:\n if max_id > 0:\n id = random.randint(1, 
max_id - 1)\n cursor.execute(sql + str(id) + ';')\n else:\n cursor.execute(sql)\n cnt += 1\n if cnt % commit_interval == 0:\n self.conn.commit()\n else:\n print(\"error, the duration should be larger than 0\")\n self.conn.commit()\n cursor.close()\n self.conn.close()\n return cnt\n\n def concurrent_execute_sql(self, threads, duration, sql, max_id=0, commit_interval=500):\n pool = ThreadPool(threads)\n results = [pool.apply_async(self.execute_sql_duration, (duration, sql, max_id, commit_interval)) for _ in range(threads)]\n pool.close()\n pool.join()\n return results\n \ndef init():\n #add the config\n config_path = \"/root/DB-GPT/config/tool_config.yaml\"\n with open(config_path, 'r') as config_file:\n config = yaml.safe_load(config_file) \n db_args =DBArgs('pgsql', config)\n return db_args\n\n#create a table\ndef create_table(table_name,colsize, ncolumns):\n db=Database(init())\n column_definitions = ', '.join(f'name{i} varchar({colsize})' for i in range(ncolumns))\n creat_sql = f'CREATE TABLE {table_name} (id int, {column_definitions}, time timestamp);'\n db.execute_sqls(creat_sql)\n\n#delete the table\ndef delete_table(table_name):\n db=Database(init())\n delete_sql=f'DROP TABLE if exists {table_name}'\n db.execute_sqls(delete_sql)\n\n#print the current time\ndef print_time():\n current_time = datetime.datetime.now()\n formatted_time = current_time.strftime(\"%Y-%m-%d %H:%M:%S\")\n print(formatted_time)\n\n\ndef insert_large_data(threads,duration,ncolumns,nrows,colsize,table_name='table1'):\n\n print_time()\n #Delete undeleted tables\n delete_table(table_name)\n #create a new table\n create_table(table_name,colsize, ncolumns)\n db=Database(init())\n #insert the data\n #insert_definitions = ', '.join(f'repeat(round(random()*999)::text,{(colsize//3)})' for i in range(ncolumns))\n insert_definitions = ', '.join(f'(SELECT substr(md5(random()::text), 1, {colsize}))' for i in range(ncolumns))\n insert_data=f'insert into {table_name} select generate_series(1,{nrows}),{insert_definitions}, now();'\n db.concurrent_execute_sql(threads,duration,insert_data,commit_interval=1)\n\n #delete the table\n delete_table(table_name)\n \n #print the end time\n print_time()\nif __name__ == \"__main__\":\n # Number of threads to use for concurrent inserts\n num_threads = 168\n \n # Duration for which to run the inserts (in seconds)\n insert_duration = None\n \n # Number of columns in the table\n num_columns = 9\n \n # Number of rows to insert\n num_rows = 67\n \n # Size of each column (in characters)\n column_size = 45\n \n # Table name\n table_name = 'table1'\n \n # Call the insert_large_data function\n insert_large_data(num_threads, insert_duration, num_columns, num_rows, column_size, table_name)\n", + "description": "In a manufacturing facility, 168 machines are generating a large amount of data that needs to be inserted simultaneously into a database. Each machine has 9 sensors, and each sensor generates data of 45 characters. There are a total of 67 machine instances. This script simulates the database exception that may occur during this data insertion process.\n", + "desc": "In an Internet of Things (IoT) scenario, there is a database named 'SensorDataDB', which is used for collecting and analyzing sensor data from various devices. This database contains a primary table called 'SensorReadings', which stores data from 67 sensors. The 'SensorReadings' table has 9 columns, each with a size of 45 characters. 
These columns may include sensor ID, sensor type, reading value, timestamp, location, and status information. In this scenario, the database is experiencing performance issues when a large amount of data is being inserted into the 'SensorReadings' table. The performance degradation is due to the lack of proper indexing or data partitioning, as well as the high number of concurrent insertion operations. The database is not able to efficiently handle the large volume of data being inserted, leading to anomalies in the system. These anomalies may result in increased write latency, database locking, or even failures in the insertion process. It is essential to optimize the insertion process by implementing appropriate indexing strategies or batch processing techniques to prevent such anomalies from occurring.\n" + }, + "393": { + "start_time": "1697354201", + "end_time": "1697354273", + "start_timestamp": "2023-10-15 15:16:41", + "end_timestamp": "2023-10-15 15:17:53", + "alerts": [], + "labels": [ + "highly concurrent commits or highly concurrent inserts" + ], + "command": "python anomaly_trigger/main.py --anomaly INSERT_LARGE_DATA", + "script": "import psycopg2\nimport sys\nsys.path.append('/root/DB-GPT/')\nimport time\nimport datetime\nimport random\nimport yaml\nfrom multiprocessing.pool import *\n\nclass DBArgs(object):\n\n def __init__(self, dbtype, config, dbname=None):\n self.dbtype = dbtype\n if self.dbtype == 'mysql':\n self.host = config['host']\n self.port = config['port']\n self.user = config['user']\n self.password = config['password']\n self.dbname = dbname if dbname else config['dbname']\n self.driver = 'com.mysql.jdbc.Driver'\n self.jdbc = 'jdbc:mysql://'\n else:\n self.host = config['host']\n self.port = config['port']\n self.user = config['user']\n self.password = config['password']\n self.dbname = dbname if dbname else config['dbname']\n self.driver = 'org.postgresql.Driver'\n self.jdbc = 'jdbc:postgresql://'\n\nclass Database():\n def __init__(self, args, timeout=-1):\n self.args = args\n self.conn = self.resetConn(timeout)\n\n\n # self.schema = self.compute_table_schema()\n\n def resetConn(self, timeout=-1):\n conn = psycopg2.connect(database=self.args.dbname,\n user=self.args.user,\n password=self.args.password,\n host=self.args.host,\n port=self.args.port)\n return conn\n\n def execute_sqls(self,sql):\n self.conn =self.resetConn(timeout=-1)\n cur = self.conn.cursor()\n cur.execute(sql)\n self.conn.commit()\n cur.close()\n self.conn.close()\n\n def execute_sql_duration(self, duration, sql, max_id=0, commit_interval=500):\n self.conn = self.resetConn(timeout=-1)\n cursor = self.conn.cursor()\n start = time.time()\n cnt = 0\n if duration > 0:\n while (time.time() - start) < duration:\n if max_id > 0:\n id = random.randint(1, max_id - 1)\n cursor.execute(sql + str(id) + ';')\n else:\n cursor.execute(sql)\n cnt += 1\n if cnt % commit_interval == 0:\n self.conn.commit()\n else:\n print(\"error, the duration should be larger than 0\")\n self.conn.commit()\n cursor.close()\n self.conn.close()\n return cnt\n\n def concurrent_execute_sql(self, threads, duration, sql, max_id=0, commit_interval=500):\n pool = ThreadPool(threads)\n results = [pool.apply_async(self.execute_sql_duration, (duration, sql, max_id, commit_interval)) for _ in range(threads)]\n pool.close()\n pool.join()\n return results\n \ndef init():\n #add the config\n config_path = \"/root/DB-GPT/config/tool_config.yaml\"\n with open(config_path, 'r') as config_file:\n config = yaml.safe_load(config_file) \n db_args 
=DBArgs('pgsql', config)\n return db_args\n\n#create a table\ndef create_table(table_name,colsize, ncolumns):\n db=Database(init())\n column_definitions = ', '.join(f'name{i} varchar({colsize})' for i in range(ncolumns))\n creat_sql = f'CREATE TABLE {table_name} (id int, {column_definitions}, time timestamp);'\n db.execute_sqls(creat_sql)\n\n#delete the table\ndef delete_table(table_name):\n db=Database(init())\n delete_sql=f'DROP TABLE if exists {table_name}'\n db.execute_sqls(delete_sql)\n\n#print the current time\ndef print_time():\n current_time = datetime.datetime.now()\n formatted_time = current_time.strftime(\"%Y-%m-%d %H:%M:%S\")\n print(formatted_time)\n\n\ndef insert_large_data(threads,duration,ncolumns,nrows,colsize,table_name='table1'):\n\n print_time()\n #Delete undeleted tables\n delete_table(table_name)\n #create a new table\n create_table(table_name,colsize, ncolumns)\n db=Database(init())\n #insert the data\n #insert_definitions = ', '.join(f'repeat(round(random()*999)::text,{(colsize//3)})' for i in range(ncolumns))\n insert_definitions = ', '.join(f'(SELECT substr(md5(random()::text), 1, {colsize}))' for i in range(ncolumns))\n insert_data=f'insert into {table_name} select generate_series(1,{nrows}),{insert_definitions}, now();'\n db.concurrent_execute_sql(threads,duration,insert_data,commit_interval=1)\n\n #delete the table\n delete_table(table_name)\n \n #print the end time\n print_time()\nif __name__ == \"__main__\":\n # Number of threads to use for concurrent inserts\n num_threads = 168\n \n # Duration for which to run the inserts (in seconds)\n insert_duration = None\n \n # Number of columns in the table\n num_columns = 40\n \n # Number of rows to insert\n num_rows = 74\n \n # Size of each column (in characters)\n column_size = 56\n \n # Table name\n table_name = 'table1'\n \n # Call the insert_large_data function\n insert_large_data(num_threads, insert_duration, num_columns, num_rows, column_size, table_name)\n", + "description": "In a data analysis system, a large amount of data from 168 different sources needs to be inserted into the database simultaneously. This process involves 40 columns and 74 rows of data, where each column has a size of 56 characters.\n", + "desc": "In the IoT scenario of a smart home, there is a database called 'SmartHomeDB' that is responsible for storing and processing data from various smart devices in a home. This database contains a table called 'DeviceData' which records information from 74 smart devices, each device having 40 columns to store data. These columns may include device ID, device type (such as thermostat, security camera, smart bulb), device status, temperature, humidity, motion detection, light intensity, power consumption, and more. In this scenario, there is an anomaly triggered by executing a script that inserts a large amount of data into the 'DeviceData' table. This involves simultaneously inserting data for 168 devices, each device having 56 characters worth of data for each column. Due to the high volume of concurrent write operations, the database might experience performance issues. This could be caused by factors such as insufficient buffer space, insufficient indexing, or contention for system resources, resulting in slower write operations and potential anomalies. 
These anomalies could manifest as delayed or failed insertions of data, ultimately affecting the functionality and efficiency of the smart home system.\n" + }, + "394": { + "start_time": "1697354333", + "end_time": "1697354393", + "start_timestamp": "2023-10-15 15:18:53", + "end_timestamp": "2023-10-15 15:19:53", + "alerts": [], + "labels": [ + "highly concurrent updates" + ], + "command": "python anomaly_trigger/main.py --anomaly LOCK_CONTENTION", + "script": "import psycopg2\nimport sys\nsys.path.append('/root/DB-GPT/')\nimport time\nimport datetime\nimport random\nimport yaml\nfrom multiprocessing.pool import *\n\n\nclass DBArgs(object):\n\n def __init__(self, dbtype, config, dbname=None):\n self.dbtype = dbtype\n if self.dbtype == 'mysql':\n self.host = config['host']\n self.port = config['port']\n self.user = config['user']\n self.password = config['password']\n self.dbname = dbname if dbname else config['dbname']\n self.driver = 'com.mysql.jdbc.Driver'\n self.jdbc = 'jdbc:mysql://'\n else:\n self.host = config['host']\n self.port = config['port']\n self.user = config['user']\n self.password = config['password']\n self.dbname = dbname if dbname else config['dbname']\n self.driver = 'org.postgresql.Driver'\n self.jdbc = 'jdbc:postgresql://'\n\nclass Database():\n def __init__(self, args, timeout=-1):\n self.args = args\n self.conn = self.resetConn(timeout)\n\n\n # self.schema = self.compute_table_schema()\n\n def resetConn(self, timeout=-1):\n conn = psycopg2.connect(database=self.args.dbname,\n user=self.args.user,\n password=self.args.password,\n host=self.args.host,\n port=self.args.port)\n return conn\n \n def execute_sqls(self,sql):\n self.conn =self.resetConn(timeout=-1)\n cur = self.conn.cursor()\n cur.execute(sql)\n self.conn.commit()\n cur.close()\n self.conn.close()\n\n def execute_sql_duration(self, duration, sql, max_id=0, commit_interval=500):\n self.conn = self.resetConn(timeout=-1)\n cursor = self.conn.cursor()\n start = time.time()\n cnt = 0\n if duration > 0:\n while (time.time() - start) < duration:\n if max_id > 0:\n id = random.randint(1, max_id - 1)\n cursor.execute(sql + str(id) + ';')\n else:\n cursor.execute(sql)\n cnt += 1\n if cnt % commit_interval == 0:\n self.conn.commit()\n else:\n print(\"error, the duration should be larger than 0\")\n self.conn.commit()\n cursor.close()\n self.conn.close()\n return cnt\n\n def concurrent_execute_sql(self, threads, duration, sql, max_id=0, commit_interval=500):\n pool = ThreadPool(threads)\n results = [pool.apply_async(self.execute_sql_duration, (duration, sql, max_id, commit_interval)) for _ in range(threads)]\n pool.close()\n pool.join()\n return results\n\ndef init():\n #add the config\n config_path = \"/root/DB-GPT/config/tool_config.yaml\"\n with open(config_path, 'r') as config_file:\n config = yaml.safe_load(config_file) \n db_args =DBArgs('pgsql', config)\n return db_args\n\n\n#create a table\ndef create_table(table_name,colsize, ncolumns):\n db=Database(init())\n column_definitions = ', '.join(f'name{i} varchar({colsize})' for i in range(ncolumns))\n creat_sql = f'CREATE TABLE {table_name} (id int, {column_definitions}, time timestamp);'\n db.execute_sqls(creat_sql)\n\n#delete the table\ndef delete_table(table_name):\n db=Database(init())\n delete_sql=f'DROP TABLE if exists {table_name}'\n db.execute_sqls(delete_sql)\n\n#print the current time\ndef print_time():\n current_time = datetime.datetime.now()\n formatted_time = current_time.strftime(\"%Y-%m-%d %H:%M:%S\")\n print(formatted_time)\n\n\ndef 
lock_contention(threads,duration,ncolumns,nrows,colsize,table_name='table1'):\n #create a new table\n print_time()\n delete_table(table_name)\n create_table(table_name,colsize, ncolumns)\n db=Database(init())\n # insert some data to be updated \n insert_definitions = ', '.join(f'(SELECT substr(md5(random()::text), 1, {colsize}))' for i in range(ncolumns))\n insert_data=f'insert into {table_name} select generate_series(1,{nrows}),{insert_definitions}, now();' \n db.execute_sqls(insert_data) \n pool = Pool(threads)\n for _ in range(threads):\n pool.apply_async(\n lock, (table_name, ncolumns, colsize, duration, nrows))\n pool.close()\n pool.join()\n #delete the table\n delete_table(table_name)\n print_time()\n\ndef lock(table_name, ncolumns, colsize, duration, nrows):\n args=init()\n start = time.time()\n #lock_contention\n while time.time()-start < duration:\n conn = psycopg2.connect(database=args.dbname, user=args.user, password=args.password,\n host=args.host, port=args.port)\n cur = conn.cursor()\n while time.time()-start < duration:\n col_name = random.randint(0, ncolumns-1)\n row_name = random.randint(1, nrows-1)\n lock_contention = f'update {table_name} set name{col_name}=(SELECT substr(md5(random()::text), 1, {colsize})) where id ={row_name}'\n #db.concurrent_execute_sql(threads,duration,lock_contention,nrows)\n cur.execute(lock_contention)\n conn.commit()\n conn.commit()\n conn.close()\n\nif __name__ == \"__main__\":\n # Number of threads to use for concurrent inserts\n num_threads = 104\n \n # Duration for which to run the inserts (in seconds)\n insert_duration = None\n \n # Number of columns in the table\n num_columns = 69\n \n # Number of rows to insert\n num_rows = 396\n \n # Size of each column (in characters)\n column_size = 98\n \n # Table name\n table_name = 'table1'\n \n # Call the insert_large_data function\n lock_contention(num_threads, insert_duration, num_columns, num_rows, column_size, table_name)\n", + "description": "In a database system used by an online platform, 104 users simultaneously attempt a frequent update operation on a database table containing 69 columns and 396 rows of records. Each column has a size of 98 characters. The users compete with each other to lock the database table in order to perform the update operation. This simulates the database exception caused by contention for locking the table under heavy concurrent access.\n", + "desc": "In a banking scenario, imagine a database called 'BankingDB' used for storing customer and transaction data. Within this database, there is a key table named 'AccountTransactions' that records detailed information about various banking transactions. This table contains 396 rows of data, with each row representing a transaction record for an account. The table has a total of 69 columns, each containing information up to 98 characters. These columns may include transaction ID, account number, transaction type (such as deposit, withdrawal, transfer), transaction amount, transaction date and time, counterparty account information, transaction status, employee ID (of the bank staff who processed the transaction), transaction location, currency type, and more.During the day, multiple users, such as bank staff, automated systems, or customers accessing the online banking platform, may simultaneously perform frequent update operations on the 'AccountTransactions' table. These operations could involve updating transaction statuses, modifying transaction amounts, or adding transaction notes. 
If at a specific moment, 104 users simultaneously attempt to update the same or adjacent rows in the table, it could lead to contention for locking the database table. This locking contention might continue for some time, thereby impacting the database's performance.During peak banking hours, this contention for locks could result in delayed processing or failure of other users' transaction requests, ultimately affecting the smooth operations of the bank. Additionally, if such incidents occur frequently, they could lead to the rapid growth of the database transaction log, which may consume excessive storage space and cause temporary interruptions in database services.\n" + }, + "395": { + "start_time": "1697354453", + "end_time": "1697354554", + "start_timestamp": "2023-10-15 15:20:53", + "end_timestamp": "2023-10-15 15:22:34", + "alerts": [], + "labels": [ + "highly deletes" + ], + "command": "python anomaly_trigger/main.py --anomaly VACUUM", + "script": "import psycopg2\nimport sys\nsys.path.append('/root/DB-GPT/')\nimport time\nimport datetime\nimport random\nimport yaml\nfrom multiprocessing.pool import *\n\n\nclass DBArgs(object):\n\n def __init__(self, dbtype, config, dbname=None):\n self.dbtype = dbtype\n if self.dbtype == 'mysql':\n self.host = config['host']\n self.port = config['port']\n self.user = config['user']\n self.password = config['password']\n self.dbname = dbname if dbname else config['dbname']\n self.driver = 'com.mysql.jdbc.Driver'\n self.jdbc = 'jdbc:mysql://'\n else:\n self.host = config['host']\n self.port = config['port']\n self.user = config['user']\n self.password = config['password']\n self.dbname = dbname if dbname else config['dbname']\n self.driver = 'org.postgresql.Driver'\n self.jdbc = 'jdbc:postgresql://'\n\nclass Database():\n def __init__(self, args, timeout=-1):\n self.args = args\n self.conn = self.resetConn(timeout)\n\n\n # self.schema = self.compute_table_schema()\n\n def resetConn(self, timeout=-1):\n conn = psycopg2.connect(database=self.args.dbname,\n user=self.args.user,\n password=self.args.password,\n host=self.args.host,\n port=self.args.port)\n return conn\n \n def execute_sqls(self,sql):\n self.conn =self.resetConn(timeout=-1)\n cur = self.conn.cursor()\n cur.execute(sql)\n self.conn.commit()\n cur.close()\n self.conn.close()\n\n def execute_sql_duration(self, duration, sql, max_id=0, commit_interval=500):\n self.conn = self.resetConn(timeout=-1)\n cursor = self.conn.cursor()\n start = time.time()\n cnt = 0\n if duration > 0:\n while (time.time() - start) < duration:\n if max_id > 0:\n id = random.randint(1, max_id - 1)\n cursor.execute(sql + str(id) + ';')\n else:\n cursor.execute(sql)\n cnt += 1\n if cnt % commit_interval == 0:\n self.conn.commit()\n else:\n print(\"error, the duration should be larger than 0\")\n self.conn.commit()\n cursor.close()\n self.conn.close()\n return cnt\n\n def concurrent_execute_sql(self, threads, duration, sql, max_id=0, commit_interval=500):\n pool = ThreadPool(threads)\n results = [pool.apply_async(self.execute_sql_duration, (duration, sql, max_id, commit_interval)) for _ in range(threads)]\n pool.close()\n pool.join()\n return results\n \ndef init():\n #add the config\n config_path = \"/root/DB-GPT/config/tool_config.yaml\"\n with open(config_path, 'r') as config_file:\n config = yaml.safe_load(config_file) \n db_args =DBArgs('pgsql', config)\n return db_args\n\n\n#create a table\ndef create_table(table_name,colsize, ncolumns):\n db=Database(init())\n column_definitions = ', '.join(f'name{i} 
varchar({colsize})' for i in range(ncolumns))\n creat_sql = f'CREATE TABLE {table_name} (id int, {column_definitions}, time timestamp);'\n db.execute_sqls(creat_sql)\n\n#delete the table\ndef delete_table(table_name):\n db=Database(init())\n delete_sql=f'DROP TABLE if exists {table_name}'\n db.execute_sqls(delete_sql)\n\n#print the current time\ndef print_time():\n current_time = datetime.datetime.now()\n formatted_time = current_time.strftime(\"%Y-%m-%d %H:%M:%S\")\n print(formatted_time)\n\n\ndef vacuum(threads,duration,ncolumns,nrows,colsize,table_name='table1'):\n db=Database(init())\n #create a new table\n print_time()\n delete_table(table_name)\n create_table(table_name,colsize, ncolumns)\n\n # insert some data to be deleted\n insert_definitions = ', '.join(f'(SELECT substr(md5(random()::text), 1, {colsize}))' for i in range(ncolumns))\n insert_data=f'insert into {table_name} select generate_series(1,{nrows}),{insert_definitions}, now();' \n db.execute_sqls(insert_data) \n\n # delete 80% of the rows\n delete_nrows=int(nrows*0.8)\n vacuum=f'delete from {table_name} where id < {delete_nrows};'\n db.execute_sqls(vacuum)\n\n # do the select , then the vacuum occurs\n select='select * from '+table_name+' where id='\n db.concurrent_execute_sql(threads,duration,select,nrows)\n\n #delete the table\n delete_table(table_name)\n print_time()\n\n\nif __name__ == \"__main__\":\n # Number of threads to use for concurrent inserts\n num_threads = 119\n \n # Duration for which to run the inserts (in seconds)\n insert_duration = None\n \n # Number of columns in the table\n num_columns = 9\n \n # Number of rows to insert\n num_rows = 2294463\n \n # Size of each column (in characters)\n column_size = 56\n \n # Table name\n table_name = 'table1'\n \n # Call the insert_large_data function\n vacuum(num_threads, insert_duration, num_columns, num_rows, column_size, table_name)\n", + "description": "In the database of an online marketplace, if 119 users simultaneously search the database table containing 9 columns, 2,294,463 rows, each column size of 56 characters, and this search occurs after a large-scale data cleaning operation, it may result in a database exception.\n", + "desc": "In an e-commerce scenario, there is a database for an online store called 'OnlineStoreDB'. This database is responsible for storing and managing information about various products. One of the key tables in this database is called 'ProductDetails', which contains detailed information about each product, such as product ID, name, price, stock quantity, description, brand, category, and more. This table has a total of 2,294,463 rows, with each row representing a unique product entry. Additionally, there are 9 columns in this table, each with a size of up to 56 characters. Now, suppose there is a need to optimize the database for better performance and storage efficiency. To achieve this, the database administrator decides to execute a 'VACUUM' operation. This operation involves reclaiming unused space in the database, freeing up storage, and reorganizing the table data to improve performance. The 'VACUUM' command is executed with a thread count of 119, indicating that multiple threads will be used to perform the operation concurrently. 
This is done to speed up the process and reduce the time required for the 'VACUUM' operation.\n" + }, + "396": { + "start_time": "1697354614", + "end_time": "1697354728", + "start_timestamp": "2023-10-15 15:23:34", + "end_timestamp": "2023-10-15 15:25:28", + "alerts": [], + "labels": [ + "too many indexes" + ], + "command": "python anomaly_trigger/main.py --anomaly REDUNDANT_INDEX", + "script": "import psycopg2\nimport sys\nsys.path.append('/root/DB-GPT/')\nimport time\nimport datetime\nimport random\nimport yaml\nfrom multiprocessing.pool import *\n\n\nclass DBArgs(object):\n\n def __init__(self, dbtype, config, dbname=None):\n self.dbtype = dbtype\n if self.dbtype == 'mysql':\n self.host = config['host']\n self.port = config['port']\n self.user = config['user']\n self.password = config['password']\n self.dbname = dbname if dbname else config['dbname']\n self.driver = 'com.mysql.jdbc.Driver'\n self.jdbc = 'jdbc:mysql://'\n else:\n self.host = config['host']\n self.port = config['port']\n self.user = config['user']\n self.password = config['password']\n self.dbname = dbname if dbname else config['dbname']\n self.driver = 'org.postgresql.Driver'\n self.jdbc = 'jdbc:postgresql://'\n\nclass Database():\n def __init__(self, args, timeout=-1):\n self.args = args\n self.conn = self.resetConn(timeout)\n\n\n # self.schema = self.compute_table_schema()\n\n def resetConn(self, timeout=-1):\n conn = psycopg2.connect(database=self.args.dbname,\n user=self.args.user,\n password=self.args.password,\n host=self.args.host,\n port=self.args.port)\n return conn\n \n def execute_sqls(self,sql):\n self.conn =self.resetConn(timeout=-1)\n cur = self.conn.cursor()\n cur.execute(sql)\n self.conn.commit()\n cur.close()\n self.conn.close()\n\n def execute_sql_duration(self, duration, sql, max_id=0, commit_interval=500):\n self.conn = self.resetConn(timeout=-1)\n cursor = self.conn.cursor()\n start = time.time()\n cnt = 0\n if duration > 0:\n while (time.time() - start) < duration:\n if max_id > 0:\n id = random.randint(1, max_id - 1)\n cursor.execute(sql + str(id) + ';')\n else:\n cursor.execute(sql)\n cnt += 1\n if cnt % commit_interval == 0:\n self.conn.commit()\n else:\n print(\"error, the duration should be larger than 0\")\n self.conn.commit()\n cursor.close()\n self.conn.close()\n return cnt\n\n def concurrent_execute_sql(self, threads, duration, sql, max_id=0, commit_interval=500):\n pool = ThreadPool(threads)\n results = [pool.apply_async(self.execute_sql_duration, (duration, sql, max_id, commit_interval)) for _ in range(threads)]\n pool.close()\n pool.join()\n return results\n \n def build_index(self, table_name, idx_num):\n self.conn = self.resetConn(timeout=-1)\n cursor = self.conn.cursor()\n \n for i in range(0, idx_num):\n the_sql = 'CREATE INDEX index_' + table_name + '_' + str(i) + ' ON ' + table_name + '(name' + str(i) + ');'\n print(the_sql)\n cursor.execute(the_sql)\n\n \n self.conn.commit()\n self.conn.close()\n return\n\n\n \n def drop_index(self,table_name):\n self.conn = self.resetConn(timeout=-1)\n cursor = self.conn.cursor()\n cursor.execute(\"select indexname from pg_indexes where tablename='\"+table_name+\"';\")\n idxs = cursor.fetchall()\n for idx in idxs:\n the_sql = 'DROP INDEX ' + idx[0] + ';'\n cursor.execute(the_sql)\n print(the_sql)\n self.conn.commit()\n self.conn.close()\n return\n\n\ndef init():\n #add the config\n config_path = \"/root/DB-GPT/config/tool_config.yaml\"\n with open(config_path, 'r') as config_file:\n config = yaml.safe_load(config_file) \n db_args =DBArgs('pgsql', 
config)\n return db_args\n\n\n#create a table\ndef create_table(table_name,colsize, ncolumns):\n db=Database(init())\n column_definitions = ', '.join(f'name{i} varchar({colsize})' for i in range(ncolumns))\n creat_sql = f'CREATE TABLE {table_name} (id int, {column_definitions}, time timestamp);'\n db.execute_sqls(creat_sql)\n\n#delete the table\ndef delete_table(table_name):\n db=Database(init())\n delete_sql=f'DROP TABLE if exists {table_name}'\n db.execute_sqls(delete_sql)\n\n#print the current time\ndef print_time():\n current_time = datetime.datetime.now()\n formatted_time = current_time.strftime(\"%Y-%m-%d %H:%M:%S\")\n print(formatted_time)\n\ndef redundent_index(threads,duration,ncolumns,nrows,colsize,nindex,table_name='table1'):\n #create a new table\n print_time()\n delete_table(table_name)\n create_table(table_name,colsize, ncolumns)\n db=Database(init())\n # insert some data to be updated \n insert_definitions = ', '.join(f'(SELECT substr(md5(random()::text), 1, {colsize}))' for i in range(ncolumns))\n insert_data=f'insert into {table_name} select generate_series(1,{nrows}),{insert_definitions}, now();' \n db.execute_sqls(insert_data) \n\n #initialization of the indexes\n nindex=int((nindex*ncolumns)/10)\n db.build_index(table_name,nindex)\n id_index='CREATE INDEX index_'+table_name+'_id ON '+table_name+'(id);'\n db.execute_sqls(id_index)\n\n #lock_contention\n pool = Pool(threads)\n for _ in range(threads):\n pool.apply_async(\n lock, (table_name, ncolumns, colsize, duration, nrows))\n pool.close()\n pool.join()\n\n #drop the index\n db.drop_index(table_name)\n\n #delete the table\n delete_table(table_name)\n print_time()\n\n\ndef lock(table_name, ncolumns, colsize, duration, nrows):\n args=init()\n start = time.time()\n #lock_contention\n while time.time()-start < duration:\n conn = psycopg2.connect(database=args.dbname, user=args.user, password=args.password,\n host=args.host, port=args.port)\n cur = conn.cursor()\n while time.time()-start < duration:\n col_name = random.randint(0, ncolumns-1)\n row_name = random.randint(1, nrows-1)\n lock_contention = f'update {table_name} set name{col_name}=(SELECT substr(md5(random()::text), 1, {colsize})) where id ={row_name}'\n #db.concurrent_execute_sql(threads,duration,lock_contention,nrows)\n cur.execute(lock_contention)\n conn.commit()\n conn.commit()\n conn.close()\n\nif __name__ == \"__main__\":\n # Number of threads to use for concurrent inserts\n num_threads = 9\n \n # Duration for which to run the inserts (in seconds)\n insert_duration = None\n \n # Number of columns in the table\n num_columns = 79\n \n # Number of rows to insert\n num_rows = 536225\n \n # Size of each column (in characters)\n column_size = 68\n \n # Table name\n table_name = 'table1'\n \n nindex=6\n \n # Call the insert_large_data function\n redundent_index(num_threads, insert_duration, num_columns, num_rows, column_size, nindex,table_name)\n", + "description": "In an e-commerce database containing 79 columns and 536,225 rows, each with a column size of 68 characters, a large number of indexes are created for different product attributes such as name, category, and price range. This simulation involves 9 users performing queries and then deleting these indexes. The goal is to replicate the additional storage usage and performance impact caused by maintaining redundant indexes.\n", + "desc": "In a business intelligence scenario, suppose there is a database named 'BusinessDataDB' that stores and analyzes various types of business information for a company. 
This database contains multiple tables, one of which is a key table named 'SalesRecords' that records detailed sales information. Within this table, there are 536,225 rows of data, each representing a sales record for a product, with a total of 79 columns, each containing information of up to 68 characters. These columns may include sales ID, product ID, customer ID, sales date, sales quantity, sales revenue, region, salesperson ID, product category, and more. In this scenario, due to the complex and diverse sales data analysis requirements of the company, the database administrator frequently creates redundant indexes on the 'SalesRecords' table before executing various analytical queries. These indexes are often created based on different combinations of columns, such as sales date, product category, or customer ID. However, the frequent creation of these redundant indexes can lead to additional storage usage and performance overhead in the database. Moreover, the continuous creation and deletion of indexes might cause database fragmentation, further impacting the performance of the business intelligence queries. This might result in delayed report generation and affect the efficiency of the decision-making process for the company.\n" + }, + "397": { + "start_time": "1697354788", + "end_time": "1697354879", + "start_timestamp": "2023-10-15 15:26:28", + "end_timestamp": "2023-10-15 15:27:59", + "alerts": [], + "labels": [ + "INSERT_LARGE_DATA", + "IO_CONTENTION" + ], + "command": "python anomaly_trigger/main.py --anomaly INSERT_LARGE_DATA,IO_CONTENTION", + "script": "import os\nimport datetime\n\n#print the current time\ndef print_time():\n current_time = datetime.datetime.now()\n formatted_time = current_time.strftime(\"%Y-%m-%d %H:%M:%S\")\n print(formatted_time)\n\nif __name__ == \"__main__\":\n print_time()\n command = (\n \"su - root -c 'cd /sysbench-tpcc-master; \"\n \"./tpcc.lua --db-driver=pgsql --tables=2 --scale=3 --threads=50 --events=0 \"\n \"--pgsql-host=xxxx --pgsql-user=xxxx --pgsql-password=xxxx \"\n \"--pgsql-port=5432 --pgsql-db=tpcc --time=90 --rand-type=uniform --report-interval=10 run'\"\n )\n\n os.system(command)\n print_time()\n", + "description": "In a system where multiple users are sharing files, there is a situation where files are being uploaded, downloaded, or edited by many users at the same time. This creates competition for input/output (I/O) resources in the file system, resulting in slower file transfers.\n", + "desc": "In a file-sharing system, the 'FileShareDB' database is used by teams or organizations to share and store files. This database not only stores the files themselves but also records metadata such as uploader information, file size, creation and modification dates, version history, access permissions, and download counts. On a typical workday, multiple users may be simultaneously uploading, downloading, or editing files. For example, a project team collaborates on an important report, with team members frequently uploading the latest versions of files, while others download them for viewing or editing. Additionally, the system might be used for storing and sharing large files, such as presentations, video conference recordings, or design drawings. Due to the high concurrency in file operations, the 'FileShareDB' database faces challenges of I/O (input/output) contention. When multiple users simultaneously upload or download large files, the system's storage and network bandwidth may become significantly strained. 
This I/O contention can lead to slower file transfer speeds, especially in situations of limited bandwidth or insufficient server processing capabilities. Furthermore, frequent write operations in the database, such as file uploads and metadata updates, can impact database performance. During peak periods, the database may encounter locking and transaction management issues, further slowing down file processing and metadata recording.\n" + }, + "398": { + "start_time": "1697354939", + "end_time": "1697355000", + "start_timestamp": "2023-10-15 15:28:59", + "end_timestamp": "2023-10-15 15:30:00", + "alerts": [], + "labels": [ + "POOR JOIN PERFORMANCE", + "CPU CONTENTION" + ], + "command": "python anomaly_trigger/main.py --anomaly POOR_JOIN_PERFORMANCE,CPU_CONTENTION", + "script": "import os\nimport re\nimport time\n\nimport psycopg2\n\n\nREPEATCOUNT = 1\nTIMELOGPATH = str(int(time.time())) + \"_job_trigger_time_log.txt\"\nTIMELOG = open(TIMELOGPATH, 'w+')\n\n\nclass Database():\n\n def __init__(self):\n self.conn = None\n self.conn = psycopg2.connect(database='imdbload',\n user='xxxx',\n password='xxxx',\n host='xxxx',\n port=5432)\n\n def execute_sql(self, sql):\n fail = 1\n cur = self.conn.cursor()\n i = 0\n cnt = 3\n while fail == 1 and i < cnt:\n try:\n fail = 0\n cur.execute(sql)\n except BaseException as error:\n fail = 1\n print(error)\n res = []\n if fail == 0:\n res = cur.fetchall()\n i = i + 1\n if fail == 1:\n # print(\"SQL Execution Fatal!!\", sql)\n return 0, ''\n elif fail == 0:\n return 1, res\n\n\ndef all_sql_files():\n res_path = \"{}/join-order-benchmark-master/\".format(\n os.path.dirname(os.path.abspath(__file__)))\n # all_file_list = list(filter(file_filter, os.listdir(res_path)))\n # all_file_list = sorted(all_file_list, key=custom_sort)\n all_file_list = [\n '1a.sql', '1b.sql', '1c.sql', '1d.sql',\n '2a.sql', '2b.sql', '2c.sql', '2d.sql',\n '3a.sql', '3b.sql', '3c.sql',\n '4a.sql', '4b.sql', '4c.sql',\n '5a.sql', '5b.sql', '5c.sql',\n '6a.sql', '6b.sql', '6c.sql', '6d.sql', '6e.sql', '6f.sql',\n '7a.sql', '7b.sql', '7c.sql',\n '8a.sql', '8b.sql', '8c.sql', '8d.sql',\n '9a.sql', '9b.sql', '9c.sql', '9d.sql',\n '10a.sql', '10b.sql', '10c.sql',\n '11a.sql', '11b.sql', '11c.sql', '11d.sql',\n '12a.sql', '12b.sql', '12c.sql',\n '13a.sql', '13b.sql', '13c.sql', '13d.sql',\n '14a.sql', '14b.sql', '14c.sql',\n '15a.sql', '15b.sql', '15c.sql', '15d.sql',\n '16a.sql', '16b.sql', '16c.sql', '16d.sql',\n '17a.sql', '17b.sql', '17c.sql', '17d.sql', '17e.sql', '17f.sql',\n '18a.sql', '18b.sql', '18c.sql',\n '19a.sql', '19b.sql', '19c.sql', '19d.sql',\n '20a.sql', '20b.sql', '20c.sql',\n '21a.sql', '21b.sql', '21c.sql',\n '22a.sql', '22b.sql', '22c.sql', '22d.sql',\n '23a.sql', '23b.sql', '23c.sql',\n '24a.sql', '24b.sql',\n '25a.sql', '25b.sql', '25c.sql',\n '26a.sql', '26b.sql', '26c.sql',\n '27a.sql', '27b.sql', '27c.sql',\n '28a.sql', '28b.sql', '28c.sql',\n '29a.sql', '29b.sql', '29c.sql',\n '30a.sql', '30b.sql', '30c.sql',\n '31a.sql', '31b.sql', '31c.sql',\n '32a.sql', '32b.sql',\n '33a.sql', '33b.sql', '33c.sql']\n\n print(all_file_list)\n files_list = []\n for file in all_file_list:\n files_list.append(res_path + file)\n return files_list\n\n\ndef custom_sort(item):\n # \u63d0\u53d6\u6570\u5b57\u548c\u5b57\u6bcd\u90e8\u5206\n match = re.match(r'(\\d+)(\\D+)', item)\n # \u5c06\u6570\u5b57\u90e8\u5206\u8f6c\u6362\u4e3a\u6574\u6570\u4ee5\u8fdb\u884c\u6bd4\u8f83\n num_part = int(match.group(1))\n # \u8fd4\u56de\u5143\u7ec4\u4ee5\u6309\u6570\u5b57\u548c\u5b57\u6bcd\u6392\u5e8f\n 
return (num_part, match.group(2))\n\n\ndef file_filter(f):\n if f[-4:] == '.sql' and 'schema' not in f and 'fkindexes' not in f:\n return True\n else:\n return False\n\n\ndef get_sql_from_file(file_name):\n file = open(file_name)\n lines = file.readlines().copy()\n sql = ''\n for line in lines:\n sql += line\n sql = sql.replace('\n', ' ').replace(' ', ' ').replace(' ', ' ')\n file.close()\n return sql\n\n\ndef test_hint_from_file(sql_file):\n db = Database()\n sql = get_sql_from_file(sql_file)\n success, result_cont = db.execute_sql(sql)\n print(success, result_cont)\n\n\ndef test_all():\n sql_files = all_sql_files()\n\n for sql_file in list(sql_files)[:-10]:\n if sql_file:\n test_hint_from_file(sql_file)\n\n\ndef test_one():\n res_path = \"{}/join-order-benchmark-master/\".format(\n os.path.dirname(os.path.abspath(__file__)))\n test_hint_from_file(res_path + '1a.sql')\n\n\nif __name__ == '__main__':\n for i in range(0, REPEATCOUNT):\n TIMELOG.write(str(int(time.time()))+\";\")\n test_all()\n TIMELOG.write(str(int(time.time()))+\"\n\")\n TIMELOG.flush()\n\n TIMELOG.close()\n\n)\n", + "description": "In a database system used by a company's HR department, multiple users are attempting to perform a join operation on two large tables containing employee data. The join operation is not optimized, causing poor performance and creating CPU contention as multiple users compete for processing power.\n", + "desc": "In a business intelligence scenario, imagine a database named 'CorporateAnalyticsDB', used for storing and analyzing various business data of a large corporation. This database contains multiple complex tables such as 'SalesData', 'CustomerProfiles', 'ProductCatalog', and 'MarketTrends', each filled with a large number of rows and columns, recording detailed business information.In this scenario, the company's analysts frequently need to perform complex join queries across multiple tables to generate comprehensive business reports. For example, they might need to join the 'SalesData' table (containing sales records) with the 'CustomerProfiles' table (containing customer information) to analyze the purchasing behaviors of different customer groups. At the same time, they might also need to link these data with 'ProductCatalog' (the product catalog) and 'MarketTrends' (market trends) to gain deeper market insights.Given the large size of these tables and the involvement of multi-table joins, the performance of these queries can become very slow. If the join keys in these tables are not effectively indexed, or if the queries are not properly optimized, then executing these join operations can consume a significant amount of time and resources. During peak periods, when multiple complex join queries are executed simultaneously, this can lead to competition for CPU resources, further reducing query efficiency. 
The CPU contention might occur due to too many compute-intensive queries running on the database server, or the server's CPU resources being insufficient to handle these queries.\n" + }, + "399": { + "start_time": "1697355060", + "end_time": "1697355200", + "start_timestamp": "2023-10-15 15:31:00", + "end_timestamp": "2023-10-15 15:33:20", + "alerts": [], + "labels": [ + "FETCH_LARGE_DATA", + "CORRELATED SUBQUERY" + ], + "command": "python anomaly_trigger/main.py --anomaly FETCH_LARGE_DATA,CORRELATED_SUBQUERY", + "script": "import os\nimport re\nimport time\n\nimport psycopg2\n\n\nREPEATCOUNT = 1\nTIMELOGPATH = str(int(time.time())) + \"_tpch_trigger_time_log.txt\"\nTIMELOG = open(TIMELOGPATH, 'w+')\n\n\nclass Database():\n\n def __init__(self):\n self.conn = None\n self.conn = psycopg2.connect(database='tpch',\n user='xxxx',\n password='xxxx',\n host='xxxx',\n port=5432)\n\n def execute_sql(self, sql):\n fail = 1\n cur = self.conn.cursor()\n i = 0\n cnt = 3\n while fail == 1 and i < cnt:\n try:\n fail = 0\n cur.execute(sql)\n except BaseException as error:\n fail = 1\n print(error)\n res = []\n if fail == 0:\n res = cur.fetchall()\n i = i + 1\n if fail == 1:\n # print(\"SQL Execution Fatal!!\", sql)\n return 0, ''\n elif fail == 0:\n return 1, res\n\n\ndef all_sql_files():\n res_path = \"{}/tpch-queries/\".format(\n os.path.dirname(os.path.abspath(__file__)))\n # all_file_list = list(filter(file_filter, os.listdir(res_path)))\n # all_file_list = sorted(all_file_list, key=custom_sort)\n all_file_list = [\n '4.explain.sql']\n\n print(all_file_list)\n files_list = []\n for file in all_file_list:\n files_list.append(res_path + file)\n return files_list\n\n\ndef custom_sort(item):\n # \u63d0\u53d6\u6570\u5b57\u548c\u5b57\u6bcd\u90e8\u5206\n match = re.match(r'(\\d+)(\\D+)', item)\n # \u5c06\u6570\u5b57\u90e8\u5206\u8f6c\u6362\u4e3a\u6574\u6570\u4ee5\u8fdb\u884c\u6bd4\u8f83\n num_part = int(match.group(1))\n # \u8fd4\u56de\u5143\u7ec4\u4ee5\u6309\u6570\u5b57\u548c\u5b57\u6bcd\u6392\u5e8f\n return (num_part, match.group(2))\n\n\ndef file_filter(f):\n if f[-4:] == '.sql' and 'schema' not in f and 'fkindexes' not in f:\n return True\n else:\n return False\n\n\ndef get_sql_from_file(file_name):\n file = open(file_name)\n lines = file.readlines().copy()\n sql = ''\n for line in lines:\n sql += line\n sql = sql.replace('\n', ' ').replace(' ', ' ').replace(' ', ' ')\n file.close()\n return sql\n\n\ndef test_hint_from_file(sql_file):\n db = Database()\n sql = get_sql_from_file(sql_file)\n success, result_cont = db.execute_sql(sql)\n print(success, result_cont)\n\n\ndef test_all():\n sql_files = all_sql_files()\n\n for sql_file in list(sql_files):\n if sql_file:\n test_hint_from_file(sql_file)\n\n\ndef test_one():\n res_path = \"{}/tpch-queries/\".format(\n os.path.dirname(os.path.abspath(__file__)))\n test_hint_from_file(res_path + '1.explain.sql')\n\n\nif __name__ == '__main__':\n for i in range(0, REPEATCOUNT):\n TIMELOG.write(str(int(time.time()))+\";\")\n test_all()\n TIMELOG.write(str(int(time.time()))+\"\n\")\n TIMELOG.flush()\n\n TIMELOG.close()\n", + "description": "In a real-life scenario, this statement could be used to simulate a situation in an e-commerce platform's database where there is a large number of product inventory. 
The performance of querying the inventory might deteriorate because of the execution of related subqueries, which are not optimized.\n", + "desc": "In the context of an e-commerce platform, there is a database called 'ECommerceDB' that contains information about various products. Within this database, there is a table called 'ProductInventory' that stores the inventory data for these products. Each inventory record includes details such as the product ID, current stock level, last update time, supplier ID, and warehouse location. In this scenario, there is a need to query the inventory levels of products, particularly for certain categories. However, due to the size of the inventory data and the requirement to perform correlated subqueries, the performance of these queries can be slow. For example, when trying to determine the total current inventory for a specific category, the database might need to retrieve a large amount of data from disk, causing potential I/O bottlenecks.\n" + }, + "400": { + "start_time": "1697355261", + "end_time": "1697355332", + "start_timestamp": "2023-10-15 15:34:21", + "end_timestamp": "2023-10-15 15:35:32", + "alerts": [], + "labels": [ + "highly concurrent commits or highly concurrent inserts" + ], + "command": "python anomaly_trigger/main.py --anomaly INSERT_LARGE_DATA", + "script": "import psycopg2\nimport sys\nsys.path.append('/root/DB-GPT/')\nimport time\nimport datetime\nimport random\nimport yaml\nfrom multiprocessing.pool import *\n\nclass DBArgs(object):\n\n def __init__(self, dbtype, config, dbname=None):\n self.dbtype = dbtype\n if self.dbtype == 'mysql':\n self.host = config['host']\n self.port = config['port']\n self.user = config['user']\n self.password = config['password']\n self.dbname = dbname if dbname else config['dbname']\n self.driver = 'com.mysql.jdbc.Driver'\n self.jdbc = 'jdbc:mysql://'\n else:\n self.host = config['host']\n self.port = config['port']\n self.user = config['user']\n self.password = config['password']\n self.dbname = dbname if dbname else config['dbname']\n self.driver = 'org.postgresql.Driver'\n self.jdbc = 'jdbc:postgresql://'\n\nclass Database():\n def __init__(self, args, timeout=-1):\n self.args = args\n self.conn = self.resetConn(timeout)\n\n\n # self.schema = self.compute_table_schema()\n\n def resetConn(self, timeout=-1):\n conn = psycopg2.connect(database=self.args.dbname,\n user=self.args.user,\n password=self.args.password,\n host=self.args.host,\n port=self.args.port)\n return conn\n\n def execute_sqls(self,sql):\n self.conn =self.resetConn(timeout=-1)\n cur = self.conn.cursor()\n cur.execute(sql)\n self.conn.commit()\n cur.close()\n self.conn.close()\n\n def execute_sql_duration(self, duration, sql, max_id=0, commit_interval=500):\n self.conn = self.resetConn(timeout=-1)\n cursor = self.conn.cursor()\n start = time.time()\n cnt = 0\n if duration > 0:\n while (time.time() - start) < duration:\n if max_id > 0:\n id = random.randint(1, max_id - 1)\n cursor.execute(sql + str(id) + ';')\n else:\n cursor.execute(sql)\n cnt += 1\n if cnt % commit_interval == 0:\n self.conn.commit()\n else:\n print(\"error, the duration should be larger than 0\")\n self.conn.commit()\n cursor.close()\n self.conn.close()\n return cnt\n\n def concurrent_execute_sql(self, threads, duration, sql, max_id=0, commit_interval=500):\n pool = ThreadPool(threads)\n results = [pool.apply_async(self.execute_sql_duration, (duration, sql, max_id, commit_interval)) for _ in range(threads)]\n pool.close()\n pool.join()\n return results\n \ndef 
init():\n #add the config\n config_path = \"/root/DB-GPT/config/tool_config.yaml\"\n with open(config_path, 'r') as config_file:\n config = yaml.safe_load(config_file) \n db_args =DBArgs('pgsql', config)\n return db_args\n\n#create a table\ndef create_table(table_name,colsize, ncolumns):\n db=Database(init())\n column_definitions = ', '.join(f'name{i} varchar({colsize})' for i in range(ncolumns))\n creat_sql = f'CREATE TABLE {table_name} (id int, {column_definitions}, time timestamp);'\n db.execute_sqls(creat_sql)\n\n#delete the table\ndef delete_table(table_name):\n db=Database(init())\n delete_sql=f'DROP TABLE if exists {table_name}'\n db.execute_sqls(delete_sql)\n\n#print the current time\ndef print_time():\n current_time = datetime.datetime.now()\n formatted_time = current_time.strftime(\"%Y-%m-%d %H:%M:%S\")\n print(formatted_time)\n\n\ndef insert_large_data(threads,duration,ncolumns,nrows,colsize,table_name='table1'):\n\n print_time()\n #Delete undeleted tables\n delete_table(table_name)\n #create a new table\n create_table(table_name,colsize, ncolumns)\n db=Database(init())\n #insert the data\n #insert_definitions = ', '.join(f'repeat(round(random()*999)::text,{(colsize//3)})' for i in range(ncolumns))\n insert_definitions = ', '.join(f'(SELECT substr(md5(random()::text), 1, {colsize}))' for i in range(ncolumns))\n insert_data=f'insert into {table_name} select generate_series(1,{nrows}),{insert_definitions}, now();'\n db.concurrent_execute_sql(threads,duration,insert_data,commit_interval=1)\n\n #delete the table\n delete_table(table_name)\n \n #print the end time\n print_time()\nif __name__ == \"__main__\":\n # Number of threads to use for concurrent inserts\n num_threads = 53\n \n # Duration for which to run the inserts (in seconds)\n insert_duration = None\n \n # Number of columns in the table\n num_columns = 5\n \n # Number of rows to insert\n num_rows = 55\n \n # Size of each column (in characters)\n column_size = 63\n \n # Table name\n table_name = 'table1'\n \n # Call the insert_large_data function\n insert_large_data(num_threads, insert_duration, num_columns, num_rows, column_size, table_name)\n", + "description": "In a manufacturing plant, there are 53 machines that are generating a large amount of data that needs to be inserted into a database. Each machine has 5 attributes, with each attribute consisting of 63 characters. There are a total of 55 records that need to be inserted. This process may cause a database exception due to the large volume of data being inserted simultaneously.\n", + "desc": "In an Internet of Things (IoT) scenario, there is a database used for collecting and analyzing sensor data, called 'SensorDataDB'. This database is specifically designed to handle sensor data from various devices. There is a table within the database called 'SensorReadings', which records information about the readings from different sensors. This table contains 55 rows, each representing a reading, with a total of 5 columns. These columns may include the sensor ID, reading type, reading value, timestamp, and sensor location, each column having a size of 63 characters. In this scenario, there are 53 threads or devices that are simultaneously inserting data into the 'SensorReadings' table. These devices are generating a large volume of data, and the database needs to handle the concurrent insertion of this data. 
However, due to factors such as the lack of sufficient buffering mechanisms or improper indexing, the database may experience performance issues in handling these simultaneous insertions. This can result in increased write latency in the database and may even lead to anomalies in the data. These anomalies can affect the overall efficiency and reliability of the IoT system.\n" + }, + "401": { + "start_time": "1697355392", + "end_time": "1697355463", + "start_timestamp": "2023-10-15 15:36:32", + "end_timestamp": "2023-10-15 15:37:43", + "alerts": [], + "labels": [ + "highly concurrent commits or highly concurrent inserts" + ], + "command": "python anomaly_trigger/main.py --anomaly INSERT_LARGE_DATA", + "script": "import psycopg2\nimport sys\nsys.path.append('/root/DB-GPT/')\nimport time\nimport datetime\nimport random\nimport yaml\nfrom multiprocessing.pool import *\n\nclass DBArgs(object):\n\n def __init__(self, dbtype, config, dbname=None):\n self.dbtype = dbtype\n if self.dbtype == 'mysql':\n self.host = config['host']\n self.port = config['port']\n self.user = config['user']\n self.password = config['password']\n self.dbname = dbname if dbname else config['dbname']\n self.driver = 'com.mysql.jdbc.Driver'\n self.jdbc = 'jdbc:mysql://'\n else:\n self.host = config['host']\n self.port = config['port']\n self.user = config['user']\n self.password = config['password']\n self.dbname = dbname if dbname else config['dbname']\n self.driver = 'org.postgresql.Driver'\n self.jdbc = 'jdbc:postgresql://'\n\nclass Database():\n def __init__(self, args, timeout=-1):\n self.args = args\n self.conn = self.resetConn(timeout)\n\n\n # self.schema = self.compute_table_schema()\n\n def resetConn(self, timeout=-1):\n conn = psycopg2.connect(database=self.args.dbname,\n user=self.args.user,\n password=self.args.password,\n host=self.args.host,\n port=self.args.port)\n return conn\n\n def execute_sqls(self,sql):\n self.conn =self.resetConn(timeout=-1)\n cur = self.conn.cursor()\n cur.execute(sql)\n self.conn.commit()\n cur.close()\n self.conn.close()\n\n def execute_sql_duration(self, duration, sql, max_id=0, commit_interval=500):\n self.conn = self.resetConn(timeout=-1)\n cursor = self.conn.cursor()\n start = time.time()\n cnt = 0\n if duration > 0:\n while (time.time() - start) < duration:\n if max_id > 0:\n id = random.randint(1, max_id - 1)\n cursor.execute(sql + str(id) + ';')\n else:\n cursor.execute(sql)\n cnt += 1\n if cnt % commit_interval == 0:\n self.conn.commit()\n else:\n print(\"error, the duration should be larger than 0\")\n self.conn.commit()\n cursor.close()\n self.conn.close()\n return cnt\n\n def concurrent_execute_sql(self, threads, duration, sql, max_id=0, commit_interval=500):\n pool = ThreadPool(threads)\n results = [pool.apply_async(self.execute_sql_duration, (duration, sql, max_id, commit_interval)) for _ in range(threads)]\n pool.close()\n pool.join()\n return results\n \ndef init():\n #add the config\n config_path = \"/root/DB-GPT/config/tool_config.yaml\"\n with open(config_path, 'r') as config_file:\n config = yaml.safe_load(config_file) \n db_args =DBArgs('pgsql', config)\n return db_args\n\n#create a table\ndef create_table(table_name,colsize, ncolumns):\n db=Database(init())\n column_definitions = ', '.join(f'name{i} varchar({colsize})' for i in range(ncolumns))\n creat_sql = f'CREATE TABLE {table_name} (id int, {column_definitions}, time timestamp);'\n db.execute_sqls(creat_sql)\n\n#delete the table\ndef delete_table(table_name):\n db=Database(init())\n delete_sql=f'DROP TABLE if 
exists {table_name}'\n db.execute_sqls(delete_sql)\n\n#print the current time\ndef print_time():\n current_time = datetime.datetime.now()\n formatted_time = current_time.strftime(\"%Y-%m-%d %H:%M:%S\")\n print(formatted_time)\n\n\ndef insert_large_data(threads,duration,ncolumns,nrows,colsize,table_name='table1'):\n\n print_time()\n #Delete undeleted tables\n delete_table(table_name)\n #create a new table\n create_table(table_name,colsize, ncolumns)\n db=Database(init())\n #insert the data\n #insert_definitions = ', '.join(f'repeat(round(random()*999)::text,{(colsize//3)})' for i in range(ncolumns))\n insert_definitions = ', '.join(f'(SELECT substr(md5(random()::text), 1, {colsize}))' for i in range(ncolumns))\n insert_data=f'insert into {table_name} select generate_series(1,{nrows}),{insert_definitions}, now();'\n db.concurrent_execute_sql(threads,duration,insert_data,commit_interval=1)\n\n #delete the table\n delete_table(table_name)\n \n #print the end time\n print_time()\nif __name__ == \"__main__\":\n # Number of threads to use for concurrent inserts\n num_threads = 53\n \n # Duration for which to run the inserts (in seconds)\n insert_duration = None\n \n # Number of columns in the table\n num_columns = 32\n \n # Number of rows to insert\n num_rows = 58\n \n # Size of each column (in characters)\n column_size = 96\n \n # Table name\n table_name = 'table1'\n \n # Call the insert_large_data function\n insert_large_data(num_threads, insert_duration, num_columns, num_rows, column_size, table_name)\n", + "description": "In a data-intensive application, such as a scientific research project, 53 sensors are generating a large amount of data that needs to be inserted into a database table. The table consists of 32 columns, each with a size of 96 characters, and there are 58 rows of data. Simulate the database exception caused by this process.\n", + "desc": "In an Internet of Things (IoT) scenario, there is a database called 'IoTDataDB' that is used to store and analyze data from various IoT devices. This database is designed to handle a large volume of data from multiple sensors and devices. One of the key tables in this database is 'DeviceData', which records detailed information about the data collected from these devices. This table consists of 58 rows of data, where each row represents a data entry from a device, and it contains 32 columns, each with a size of 96 characters. These columns might include device ID, sensor type, reading value, timestamp, location, status, and other relevant information.In this scenario, the database is experiencing performance issues related to inserting large amounts of data into the 'DeviceData' table. When there are 53 threads simultaneously inserting data into the table, the database's ability to handle these concurrent insertions is being challenged. This can result in slower insertion times and increased write latency in the database. 
Additionally, without appropriate optimization measures such as batch processing or efficient buffering mechanisms, the database may encounter I/O contention due to the high volume of data being written, leading to further performance degradation.In an IoT environment, where real-time data ingestion and analysis are crucial, such performance anomalies could impact the responsiveness of data processing and analysis systems, and ultimately affect the reliability and efficiency of IoT applications and services.\n" + }, + "402": { + "start_time": "1697355523", + "end_time": "1697355583", + "start_timestamp": "2023-10-15 15:38:43", + "end_timestamp": "2023-10-15 15:39:43", + "alerts": [], + "labels": [ + "highly concurrent updates" + ], + "command": "python anomaly_trigger/main.py --anomaly LOCK_CONTENTION", + "script": "import psycopg2\nimport sys\nsys.path.append('/root/DB-GPT/')\nimport time\nimport datetime\nimport random\nimport yaml\nfrom multiprocessing.pool import *\n\n\nclass DBArgs(object):\n\n def __init__(self, dbtype, config, dbname=None):\n self.dbtype = dbtype\n if self.dbtype == 'mysql':\n self.host = config['host']\n self.port = config['port']\n self.user = config['user']\n self.password = config['password']\n self.dbname = dbname if dbname else config['dbname']\n self.driver = 'com.mysql.jdbc.Driver'\n self.jdbc = 'jdbc:mysql://'\n else:\n self.host = config['host']\n self.port = config['port']\n self.user = config['user']\n self.password = config['password']\n self.dbname = dbname if dbname else config['dbname']\n self.driver = 'org.postgresql.Driver'\n self.jdbc = 'jdbc:postgresql://'\n\nclass Database():\n def __init__(self, args, timeout=-1):\n self.args = args\n self.conn = self.resetConn(timeout)\n\n\n # self.schema = self.compute_table_schema()\n\n def resetConn(self, timeout=-1):\n conn = psycopg2.connect(database=self.args.dbname,\n user=self.args.user,\n password=self.args.password,\n host=self.args.host,\n port=self.args.port)\n return conn\n \n def execute_sqls(self,sql):\n self.conn =self.resetConn(timeout=-1)\n cur = self.conn.cursor()\n cur.execute(sql)\n self.conn.commit()\n cur.close()\n self.conn.close()\n\n def execute_sql_duration(self, duration, sql, max_id=0, commit_interval=500):\n self.conn = self.resetConn(timeout=-1)\n cursor = self.conn.cursor()\n start = time.time()\n cnt = 0\n if duration > 0:\n while (time.time() - start) < duration:\n if max_id > 0:\n id = random.randint(1, max_id - 1)\n cursor.execute(sql + str(id) + ';')\n else:\n cursor.execute(sql)\n cnt += 1\n if cnt % commit_interval == 0:\n self.conn.commit()\n else:\n print(\"error, the duration should be larger than 0\")\n self.conn.commit()\n cursor.close()\n self.conn.close()\n return cnt\n\n def concurrent_execute_sql(self, threads, duration, sql, max_id=0, commit_interval=500):\n pool = ThreadPool(threads)\n results = [pool.apply_async(self.execute_sql_duration, (duration, sql, max_id, commit_interval)) for _ in range(threads)]\n pool.close()\n pool.join()\n return results\n\ndef init():\n #add the config\n config_path = \"/root/DB-GPT/config/tool_config.yaml\"\n with open(config_path, 'r') as config_file:\n config = yaml.safe_load(config_file) \n db_args =DBArgs('pgsql', config)\n return db_args\n\n\n#create a table\ndef create_table(table_name,colsize, ncolumns):\n db=Database(init())\n column_definitions = ', '.join(f'name{i} varchar({colsize})' for i in range(ncolumns))\n creat_sql = f'CREATE TABLE {table_name} (id int, {column_definitions}, time timestamp);'\n 
db.execute_sqls(creat_sql)\n\n#delete the table\ndef delete_table(table_name):\n db=Database(init())\n delete_sql=f'DROP TABLE if exists {table_name}'\n db.execute_sqls(delete_sql)\n\n#print the current time\ndef print_time():\n current_time = datetime.datetime.now()\n formatted_time = current_time.strftime(\"%Y-%m-%d %H:%M:%S\")\n print(formatted_time)\n\n\ndef lock_contention(threads,duration,ncolumns,nrows,colsize,table_name='table1'):\n #create a new table\n print_time()\n delete_table(table_name)\n create_table(table_name,colsize, ncolumns)\n db=Database(init())\n # insert some data to be updated \n insert_definitions = ', '.join(f'(SELECT substr(md5(random()::text), 1, {colsize}))' for i in range(ncolumns))\n insert_data=f'insert into {table_name} select generate_series(1,{nrows}),{insert_definitions}, now();' \n db.execute_sqls(insert_data) \n pool = Pool(threads)\n for _ in range(threads):\n pool.apply_async(\n lock, (table_name, ncolumns, colsize, duration, nrows))\n pool.close()\n pool.join()\n #delete the table\n delete_table(table_name)\n print_time()\n\ndef lock(table_name, ncolumns, colsize, duration, nrows):\n args=init()\n start = time.time()\n #lock_contention\n while time.time()-start < duration:\n conn = psycopg2.connect(database=args.dbname, user=args.user, password=args.password,\n host=args.host, port=args.port)\n cur = conn.cursor()\n while time.time()-start < duration:\n col_name = random.randint(0, ncolumns-1)\n row_name = random.randint(1, nrows-1)\n lock_contention = f'update {table_name} set name{col_name}=(SELECT substr(md5(random()::text), 1, {colsize})) where id ={row_name}'\n #db.concurrent_execute_sql(threads,duration,lock_contention,nrows)\n cur.execute(lock_contention)\n conn.commit()\n conn.commit()\n conn.close()\n\nif __name__ == \"__main__\":\n # Number of threads to use for concurrent inserts\n num_threads = 133\n \n # Duration for which to run the inserts (in seconds)\n insert_duration = None\n \n # Number of columns in the table\n num_columns = 58\n \n # Number of rows to insert\n num_rows = 247\n \n # Size of each column (in characters)\n column_size = 73\n \n # Table name\n table_name = 'table1'\n \n # Call the insert_large_data function\n lock_contention(num_threads, insert_duration, num_columns, num_rows, column_size, table_name)\n", + "description": "In the database of an online store, multiple users simultaneously attempt to perform frequent update operations in a database table containing 58 columns and 247 rows of product records, each with a column size of 73 characters. These users compete with each other to lock the database table, potentially causing a database exception. The simulation runs with 133 threads.\n", + "desc": "In this Internet of Things (IoT) scenario, let's consider a database named 'SensorDataDB', which is used for collecting and analyzing sensor data from various devices. This database is designed to handle a large volume of sensor data, and it includes a key table called 'SensorReadings'. This table contains information from 247 sensors, with each sensor having 58 columns to store data such as sensor ID, reading type, reading value, timestamp, location, and status.In this particular case, 133 devices are simultaneously transmitting data to the database at a high frequency. Due to the database locking mechanism and the lack of proper optimization, there might be contention issues among these concurrent write operations. 
This contention can lead to delayed processing or even failures of the write requests, impacting the reliability and efficiency of the entire system.To resolve this issue, it is essential to analyze and optimize the database locking mechanism, implement efficient data partitioning strategies, and ensure proper indexing on key columns that are frequently updated. By doing so, the database's performance can be improved, effectively avoiding or minimizing locking contention anomalies.\n" + }, + "403": { + "start_time": "1697355643", + "end_time": "1697355692", + "start_timestamp": "2023-10-15 15:40:43", + "end_timestamp": "2023-10-15 15:41:32", + "alerts": [], + "labels": [ + "highly deletes" + ], + "command": "python anomaly_trigger/main.py --anomaly VACUUM", + "script": "import psycopg2\nimport sys\nsys.path.append('/root/DB-GPT/')\nimport time\nimport datetime\nimport random\nimport yaml\nfrom multiprocessing.pool import *\n\n\nclass DBArgs(object):\n\n def __init__(self, dbtype, config, dbname=None):\n self.dbtype = dbtype\n if self.dbtype == 'mysql':\n self.host = config['host']\n self.port = config['port']\n self.user = config['user']\n self.password = config['password']\n self.dbname = dbname if dbname else config['dbname']\n self.driver = 'com.mysql.jdbc.Driver'\n self.jdbc = 'jdbc:mysql://'\n else:\n self.host = config['host']\n self.port = config['port']\n self.user = config['user']\n self.password = config['password']\n self.dbname = dbname if dbname else config['dbname']\n self.driver = 'org.postgresql.Driver'\n self.jdbc = 'jdbc:postgresql://'\n\nclass Database():\n def __init__(self, args, timeout=-1):\n self.args = args\n self.conn = self.resetConn(timeout)\n\n\n # self.schema = self.compute_table_schema()\n\n def resetConn(self, timeout=-1):\n conn = psycopg2.connect(database=self.args.dbname,\n user=self.args.user,\n password=self.args.password,\n host=self.args.host,\n port=self.args.port)\n return conn\n \n def execute_sqls(self,sql):\n self.conn =self.resetConn(timeout=-1)\n cur = self.conn.cursor()\n cur.execute(sql)\n self.conn.commit()\n cur.close()\n self.conn.close()\n\n def execute_sql_duration(self, duration, sql, max_id=0, commit_interval=500):\n self.conn = self.resetConn(timeout=-1)\n cursor = self.conn.cursor()\n start = time.time()\n cnt = 0\n if duration > 0:\n while (time.time() - start) < duration:\n if max_id > 0:\n id = random.randint(1, max_id - 1)\n cursor.execute(sql + str(id) + ';')\n else:\n cursor.execute(sql)\n cnt += 1\n if cnt % commit_interval == 0:\n self.conn.commit()\n else:\n print(\"error, the duration should be larger than 0\")\n self.conn.commit()\n cursor.close()\n self.conn.close()\n return cnt\n\n def concurrent_execute_sql(self, threads, duration, sql, max_id=0, commit_interval=500):\n pool = ThreadPool(threads)\n results = [pool.apply_async(self.execute_sql_duration, (duration, sql, max_id, commit_interval)) for _ in range(threads)]\n pool.close()\n pool.join()\n return results\n \ndef init():\n #add the config\n config_path = \"/root/DB-GPT/config/tool_config.yaml\"\n with open(config_path, 'r') as config_file:\n config = yaml.safe_load(config_file) \n db_args =DBArgs('pgsql', config)\n return db_args\n\n\n#create a table\ndef create_table(table_name,colsize, ncolumns):\n db=Database(init())\n column_definitions = ', '.join(f'name{i} varchar({colsize})' for i in range(ncolumns))\n creat_sql = f'CREATE TABLE {table_name} (id int, {column_definitions}, time timestamp);'\n db.execute_sqls(creat_sql)\n\n#delete the table\ndef 
delete_table(table_name):\n db=Database(init())\n delete_sql=f'DROP TABLE if exists {table_name}'\n db.execute_sqls(delete_sql)\n\n#print the current time\ndef print_time():\n current_time = datetime.datetime.now()\n formatted_time = current_time.strftime(\"%Y-%m-%d %H:%M:%S\")\n print(formatted_time)\n\n\ndef vacuum(threads,duration,ncolumns,nrows,colsize,table_name='table1'):\n db=Database(init())\n #create a new table\n print_time()\n delete_table(table_name)\n create_table(table_name,colsize, ncolumns)\n\n # insert some data to be deleted\n insert_definitions = ', '.join(f'(SELECT substr(md5(random()::text), 1, {colsize}))' for i in range(ncolumns))\n insert_data=f'insert into {table_name} select generate_series(1,{nrows}),{insert_definitions}, now();' \n db.execute_sqls(insert_data) \n\n # delete 80% of the rows\n delete_nrows=int(nrows*0.8)\n vacuum=f'delete from {table_name} where id < {delete_nrows};'\n db.execute_sqls(vacuum)\n\n # do the select , then the vacuum occurs\n select='select * from '+table_name+' where id='\n db.concurrent_execute_sql(threads,duration,select,nrows)\n\n #delete the table\n delete_table(table_name)\n print_time()\n\n\nif __name__ == \"__main__\":\n # Number of threads to use for concurrent inserts\n num_threads = 168\n \n # Duration for which to run the inserts (in seconds)\n insert_duration = None\n \n # Number of columns in the table\n num_columns = 5\n \n # Number of rows to insert\n num_rows = 2236952\n \n # Size of each column (in characters)\n column_size = 96\n \n # Table name\n table_name = 'table1'\n \n # Call the insert_large_data function\n vacuum(num_threads, insert_duration, num_columns, num_rows, column_size, table_name)\n", + "description": "In the database of a social media platform, after a large-scale data cleaning operation on a database table containing 5 columns, 2,236,952 rows, each column size of 96 characters of user records, 168 users simultaneously perform a search. Exception during the search process.\n", + "desc": "In the IoT scenario, imagine a database named 'SensorDataDB' that stores sensor data from various IoT devices. This database is designed to handle a large volume of data from sensors. The main table in the database is called 'SensorReadings' and it contains information such as sensor ID, reading type, reading value, timestamp, sensor location, and status. In this scenario, the database administrator needs to perform a 'VACUUM' operation on the 'SensorReadings' table. This operation involves reclaiming space and optimizing the table by removing fragmentation and reorganizing data pages. Due to the large size of the table, with over 2 million rows of data and 5 columns, this VACUUM operation could require significant resources and time to complete. 
Without proper optimization and scheduling of this operation, it could cause performance issues and disruptions in the IoT system.\n" + }, + "404": { + "start_time": "1697355752", + "end_time": "1697355866", + "start_timestamp": "2023-10-15 15:42:32", + "end_timestamp": "2023-10-15 15:44:26", + "alerts": [], + "labels": [ + "too many indexes" + ], + "command": "python anomaly_trigger/main.py --anomaly REDUNDANT_INDEX", + "script": "import psycopg2\nimport sys\nsys.path.append('/root/DB-GPT/')\nimport time\nimport datetime\nimport random\nimport yaml\nfrom multiprocessing.pool import *\n\n\nclass DBArgs(object):\n\n def __init__(self, dbtype, config, dbname=None):\n self.dbtype = dbtype\n if self.dbtype == 'mysql':\n self.host = config['host']\n self.port = config['port']\n self.user = config['user']\n self.password = config['password']\n self.dbname = dbname if dbname else config['dbname']\n self.driver = 'com.mysql.jdbc.Driver'\n self.jdbc = 'jdbc:mysql://'\n else:\n self.host = config['host']\n self.port = config['port']\n self.user = config['user']\n self.password = config['password']\n self.dbname = dbname if dbname else config['dbname']\n self.driver = 'org.postgresql.Driver'\n self.jdbc = 'jdbc:postgresql://'\n\nclass Database():\n def __init__(self, args, timeout=-1):\n self.args = args\n self.conn = self.resetConn(timeout)\n\n\n # self.schema = self.compute_table_schema()\n\n def resetConn(self, timeout=-1):\n conn = psycopg2.connect(database=self.args.dbname,\n user=self.args.user,\n password=self.args.password,\n host=self.args.host,\n port=self.args.port)\n return conn\n \n def execute_sqls(self,sql):\n self.conn =self.resetConn(timeout=-1)\n cur = self.conn.cursor()\n cur.execute(sql)\n self.conn.commit()\n cur.close()\n self.conn.close()\n\n def execute_sql_duration(self, duration, sql, max_id=0, commit_interval=500):\n self.conn = self.resetConn(timeout=-1)\n cursor = self.conn.cursor()\n start = time.time()\n cnt = 0\n if duration > 0:\n while (time.time() - start) < duration:\n if max_id > 0:\n id = random.randint(1, max_id - 1)\n cursor.execute(sql + str(id) + ';')\n else:\n cursor.execute(sql)\n cnt += 1\n if cnt % commit_interval == 0:\n self.conn.commit()\n else:\n print(\"error, the duration should be larger than 0\")\n self.conn.commit()\n cursor.close()\n self.conn.close()\n return cnt\n\n def concurrent_execute_sql(self, threads, duration, sql, max_id=0, commit_interval=500):\n pool = ThreadPool(threads)\n results = [pool.apply_async(self.execute_sql_duration, (duration, sql, max_id, commit_interval)) for _ in range(threads)]\n pool.close()\n pool.join()\n return results\n \n def build_index(self, table_name, idx_num):\n self.conn = self.resetConn(timeout=-1)\n cursor = self.conn.cursor()\n \n for i in range(0, idx_num):\n the_sql = 'CREATE INDEX index_' + table_name + '_' + str(i) + ' ON ' + table_name + '(name' + str(i) + ');'\n print(the_sql)\n cursor.execute(the_sql)\n\n \n self.conn.commit()\n self.conn.close()\n return\n\n\n \n def drop_index(self,table_name):\n self.conn = self.resetConn(timeout=-1)\n cursor = self.conn.cursor()\n cursor.execute(\"select indexname from pg_indexes where tablename='\"+table_name+\"';\")\n idxs = cursor.fetchall()\n for idx in idxs:\n the_sql = 'DROP INDEX ' + idx[0] + ';'\n cursor.execute(the_sql)\n print(the_sql)\n self.conn.commit()\n self.conn.close()\n return\n\n\ndef init():\n #add the config\n config_path = \"/root/DB-GPT/config/tool_config.yaml\"\n with open(config_path, 'r') as config_file:\n config = 
yaml.safe_load(config_file) \n db_args =DBArgs('pgsql', config)\n return db_args\n\n\n#create a table\ndef create_table(table_name,colsize, ncolumns):\n db=Database(init())\n column_definitions = ', '.join(f'name{i} varchar({colsize})' for i in range(ncolumns))\n creat_sql = f'CREATE TABLE {table_name} (id int, {column_definitions}, time timestamp);'\n db.execute_sqls(creat_sql)\n\n#delete the table\ndef delete_table(table_name):\n db=Database(init())\n delete_sql=f'DROP TABLE if exists {table_name}'\n db.execute_sqls(delete_sql)\n\n#print the current time\ndef print_time():\n current_time = datetime.datetime.now()\n formatted_time = current_time.strftime(\"%Y-%m-%d %H:%M:%S\")\n print(formatted_time)\n\ndef redundent_index(threads,duration,ncolumns,nrows,colsize,nindex,table_name='table1'):\n #create a new table\n print_time()\n delete_table(table_name)\n create_table(table_name,colsize, ncolumns)\n db=Database(init())\n # insert some data to be updated \n insert_definitions = ', '.join(f'(SELECT substr(md5(random()::text), 1, {colsize}))' for i in range(ncolumns))\n insert_data=f'insert into {table_name} select generate_series(1,{nrows}),{insert_definitions}, now();' \n db.execute_sqls(insert_data) \n\n #initialization of the indexes\n nindex=int((nindex*ncolumns)/10)\n db.build_index(table_name,nindex)\n id_index='CREATE INDEX index_'+table_name+'_id ON '+table_name+'(id);'\n db.execute_sqls(id_index)\n\n #lock_contention\n pool = Pool(threads)\n for _ in range(threads):\n pool.apply_async(\n lock, (table_name, ncolumns, colsize, duration, nrows))\n pool.close()\n pool.join()\n\n #drop the index\n db.drop_index(table_name)\n\n #delete the table\n delete_table(table_name)\n print_time()\n\n\ndef lock(table_name, ncolumns, colsize, duration, nrows):\n args=init()\n start = time.time()\n #lock_contention\n while time.time()-start < duration:\n conn = psycopg2.connect(database=args.dbname, user=args.user, password=args.password,\n host=args.host, port=args.port)\n cur = conn.cursor()\n while time.time()-start < duration:\n col_name = random.randint(0, ncolumns-1)\n row_name = random.randint(1, nrows-1)\n lock_contention = f'update {table_name} set name{col_name}=(SELECT substr(md5(random()::text), 1, {colsize})) where id ={row_name}'\n #db.concurrent_execute_sql(threads,duration,lock_contention,nrows)\n cur.execute(lock_contention)\n conn.commit()\n conn.commit()\n conn.close()\n\nif __name__ == \"__main__\":\n # Number of threads to use for concurrent inserts\n num_threads = 5\n \n # Duration for which to run the inserts (in seconds)\n insert_duration = None\n \n # Number of columns in the table\n num_columns = 94\n \n # Number of rows to insert\n num_rows = 620427\n \n # Size of each column (in characters)\n column_size = 63\n \n # Table name\n table_name = 'table1'\n \n nindex=6\n \n # Call the insert_large_data function\n redundent_index(num_threads, insert_duration, num_columns, num_rows, column_size, nindex,table_name)\n", + "description": "In a large-scale e-commerce database, when 5 users simultaneously perform a query operation on a database table containing 94 columns and 620,427 rows, each column with a size of 63 characters, redundantly created indexes for product information such as name, category, and price range will cause additional storage overhead and potentially impact the query performance.\n", + "desc": "In a business intelligence scenario, particularly in a database named 'CorporateFinanceDB' used for financial data analysis, there is a table called 'FinancialRecords' 
which stores detailed financial transactions and statements of a company. This table contains a large amount of data, with 620,427 rows and a total of 94 columns. The columns can represent various financial information such as transaction ID, transaction type, amount, date, department, project code, financial year, audit status, etc. To improve query performance for financial analysis, the database administrator creates multiple indexes before running complex queries. These indexes are typically based on criteria such as transaction type, date range, department, or project code. However, if the indexes are unnecessarily created or not properly managed, they can lead to redundant indexes in the database. Redundant indexes consume additional storage space, introduce performance overhead, and may cause database fragmentation. As a result, the efficiency of financial analysis queries could be negatively impacted, causing delays in generating reports and affecting the decision-making process.\n" + }, + "405": { + "start_time": "1697355926", + "end_time": "1697356017", + "start_timestamp": "2023-10-15 15:45:26", + "end_timestamp": "2023-10-15 15:46:57", + "alerts": [], + "labels": [ + "INSERT_LARGE_DATA", + "IO_CONTENTION" + ], + "command": "python anomaly_trigger/main.py --anomaly INSERT_LARGE_DATA,IO_CONTENTION", + "script": "import os\nimport datetime\n\n#print the current time\ndef print_time():\n current_time = datetime.datetime.now()\n formatted_time = current_time.strftime(\"%Y-%m-%d %H:%M:%S\")\n print(formatted_time)\n\nif __name__ == \"__main__\":\n print_time()\n command = (\n \"su - root -c 'cd /sysbench-tpcc-master; \"\n \"./tpcc.lua --db-driver=pgsql --tables=2 --scale=3 --threads=50 --events=0 \"\n \"--pgsql-host=xxxx --pgsql-user=xxxx --pgsql-password=xxxx \"\n \"--pgsql-port=5432 --pgsql-db=tpcc --time=90 --rand-type=uniform --report-interval=10 run'\"\n )\n\n os.system(command)\n print_time()\n", + "description": "In a file sharing system, multiple users are uploading, downloading, or editing files simultaneously. As a result, there is contention for input/output (I/O) resources, leading to a slowdown in file transfer.\n", + "desc": "In a file transfer system scenario, there is a database called 'FileShareDB' used by teams or organizations to share files. This database not only stores the files themselves but also records metadata such as uploader information, file size, creation and modification dates, version history, access permissions, and download counts. During peak usage periods, when multiple users are uploading, downloading, or editing files simultaneously, the system may experience IO contention. This means that the storage and network bandwidth of the system may be strained, especially when dealing with large file transfers or limited bandwidth. As a result, file transfer speeds may be slower, and database performance may be impacted by frequent write operations. 
This contention in IO can lead to delayed file transfers, especially when there are many concurrent users, and can also affect the recording of metadata in the database.\n" + }, + "406": { + "start_time": "1697356077", + "end_time": "1697356138", + "start_timestamp": "2023-10-15 15:47:57", + "end_timestamp": "2023-10-15 15:48:58", + "alerts": [], + "labels": [ + "POOR JOIN PERFORMANCE", + "CPU CONTENTION" + ], + "command": "python anomaly_trigger/main.py --anomaly POOR_JOIN_PERFORMANCE,CPU_CONTENTION", + "script": "import os\nimport re\nimport time\n\nimport psycopg2\n\n\nREPEATCOUNT = 1\nTIMELOGPATH = str(int(time.time())) + \"_job_trigger_time_log.txt\"\nTIMELOG = open(TIMELOGPATH, 'w+')\n\n\nclass Database():\n\n def __init__(self):\n self.conn = None\n self.conn = psycopg2.connect(database='imdbload',\n user='xxxx',\n password='xxxx',\n host='xxxx',\n port=5432)\n\n def execute_sql(self, sql):\n fail = 1\n cur = self.conn.cursor()\n i = 0\n cnt = 3\n while fail == 1 and i < cnt:\n try:\n fail = 0\n cur.execute(sql)\n except BaseException as error:\n fail = 1\n print(error)\n res = []\n if fail == 0:\n res = cur.fetchall()\n i = i + 1\n if fail == 1:\n # print(\"SQL Execution Fatal!!\", sql)\n return 0, ''\n elif fail == 0:\n return 1, res\n\n\ndef all_sql_files():\n res_path = \"{}/join-order-benchmark-master/\".format(\n os.path.dirname(os.path.abspath(__file__)))\n # all_file_list = list(filter(file_filter, os.listdir(res_path)))\n # all_file_list = sorted(all_file_list, key=custom_sort)\n all_file_list = [\n '1a.sql', '1b.sql', '1c.sql', '1d.sql',\n '2a.sql', '2b.sql', '2c.sql', '2d.sql',\n '3a.sql', '3b.sql', '3c.sql',\n '4a.sql', '4b.sql', '4c.sql',\n '5a.sql', '5b.sql', '5c.sql',\n '6a.sql', '6b.sql', '6c.sql', '6d.sql', '6e.sql', '6f.sql',\n '7a.sql', '7b.sql', '7c.sql',\n '8a.sql', '8b.sql', '8c.sql', '8d.sql',\n '9a.sql', '9b.sql', '9c.sql', '9d.sql',\n '10a.sql', '10b.sql', '10c.sql',\n '11a.sql', '11b.sql', '11c.sql', '11d.sql',\n '12a.sql', '12b.sql', '12c.sql',\n '13a.sql', '13b.sql', '13c.sql', '13d.sql',\n '14a.sql', '14b.sql', '14c.sql',\n '15a.sql', '15b.sql', '15c.sql', '15d.sql',\n '16a.sql', '16b.sql', '16c.sql', '16d.sql',\n '17a.sql', '17b.sql', '17c.sql', '17d.sql', '17e.sql', '17f.sql',\n '18a.sql', '18b.sql', '18c.sql',\n '19a.sql', '19b.sql', '19c.sql', '19d.sql',\n '20a.sql', '20b.sql', '20c.sql',\n '21a.sql', '21b.sql', '21c.sql',\n '22a.sql', '22b.sql', '22c.sql', '22d.sql',\n '23a.sql', '23b.sql', '23c.sql',\n '24a.sql', '24b.sql',\n '25a.sql', '25b.sql', '25c.sql',\n '26a.sql', '26b.sql', '26c.sql',\n '27a.sql', '27b.sql', '27c.sql',\n '28a.sql', '28b.sql', '28c.sql',\n '29a.sql', '29b.sql', '29c.sql',\n '30a.sql', '30b.sql', '30c.sql',\n '31a.sql', '31b.sql', '31c.sql',\n '32a.sql', '32b.sql',\n '33a.sql', '33b.sql', '33c.sql']\n\n print(all_file_list)\n files_list = []\n for file in all_file_list:\n files_list.append(res_path + file)\n return files_list\n\n\ndef custom_sort(item):\n # \u63d0\u53d6\u6570\u5b57\u548c\u5b57\u6bcd\u90e8\u5206\n match = re.match(r'(\\d+)(\\D+)', item)\n # \u5c06\u6570\u5b57\u90e8\u5206\u8f6c\u6362\u4e3a\u6574\u6570\u4ee5\u8fdb\u884c\u6bd4\u8f83\n num_part = int(match.group(1))\n # \u8fd4\u56de\u5143\u7ec4\u4ee5\u6309\u6570\u5b57\u548c\u5b57\u6bcd\u6392\u5e8f\n return (num_part, match.group(2))\n\n\ndef file_filter(f):\n if f[-4:] == '.sql' and 'schema' not in f and 'fkindexes' not in f:\n return True\n else:\n return False\n\n\ndef get_sql_from_file(file_name):\n file = open(file_name)\n lines = file.readlines().copy()\n sql 
= ''\n for line in lines:\n sql += line\n sql = sql.replace('\n', ' ').replace(' ', ' ').replace(' ', ' ')\n file.close()\n return sql\n\n\ndef test_hint_from_file(sql_file):\n db = Database()\n sql = get_sql_from_file(sql_file)\n success, result_cont = db.execute_sql(sql)\n print(success, result_cont)\n\n\ndef test_all():\n sql_files = all_sql_files()\n\n for sql_file in list(sql_files)[:-10]:\n if sql_file:\n test_hint_from_file(sql_file)\n\n\ndef test_one():\n res_path = \"{}/join-order-benchmark-master/\".format(\n os.path.dirname(os.path.abspath(__file__)))\n test_hint_from_file(res_path + '1a.sql')\n\n\nif __name__ == '__main__':\n for i in range(0, REPEATCOUNT):\n TIMELOG.write(str(int(time.time()))+\";\")\n test_all()\n TIMELOG.write(str(int(time.time()))+\"\n\")\n TIMELOG.flush()\n\n TIMELOG.close()\n\n)\n", + "description": "In a business scenario where a large amount of data is being processed, multiple users are executing join operations on a database table with poor performance. The join operations involve data from different tables, and the CPU becomes overloaded with the processing tasks, leading to a decrease in overall system performance.\n", + "desc": "In a business intelligence scenario, there is a database named 'CorporateAnalyticsDB' that is used to store and analyze various business data of a large corporation. This database contains multiple complex tables such as 'SalesData', 'CustomerProfiles', 'ProductCatalog', and 'MarketTrends', each filled with a large number of rows and columns, recording detailed business information.The performance of join queries across these tables, which are commonly used by analysts to generate comprehensive business reports, can be poor. For example, joining the 'SalesData' table with the 'CustomerProfiles' table to analyze customer purchasing behaviors or linking the data with 'ProductCatalog' and 'MarketTrends' to gain market insights can be slow. This performance issue can arise if the join keys in these tables are not effectively indexed or if the queries are not properly optimized.During peak periods, when multiple complex join queries are executed simultaneously, there can be competition for CPU resources. This can further reduce the efficiency of these queries and impact the performance of the database server. 
CPU contention might occur due to too many compute-intensive queries running on the server or the server's CPU resources being insufficient to handle the workload.\n" + }, + "407": { + "start_time": "1697356198", + "end_time": "1697356347", + "start_timestamp": "2023-10-15 15:49:58", + "end_timestamp": "2023-10-15 15:52:27", + "alerts": [], + "labels": [ + "FETCH_LARGE_DATA", + "CORRELATED SUBQUERY" + ], + "command": "python anomaly_trigger/main.py --anomaly FETCH_LARGE_DATA,CORRELATED_SUBQUERY", + "script": "import os\nimport re\nimport time\n\nimport psycopg2\n\n\nREPEATCOUNT = 1\nTIMELOGPATH = str(int(time.time())) + \"_tpch_trigger_time_log.txt\"\nTIMELOG = open(TIMELOGPATH, 'w+')\n\n\nclass Database():\n\n def __init__(self):\n self.conn = None\n self.conn = psycopg2.connect(database='tpch',\n user='xxxx',\n password='xxxx',\n host='xxxx',\n port=5432)\n\n def execute_sql(self, sql):\n fail = 1\n cur = self.conn.cursor()\n i = 0\n cnt = 3\n while fail == 1 and i < cnt:\n try:\n fail = 0\n cur.execute(sql)\n except BaseException as error:\n fail = 1\n print(error)\n res = []\n if fail == 0:\n res = cur.fetchall()\n i = i + 1\n if fail == 1:\n # print(\"SQL Execution Fatal!!\", sql)\n return 0, ''\n elif fail == 0:\n return 1, res\n\n\ndef all_sql_files():\n res_path = \"{}/tpch-queries/\".format(\n os.path.dirname(os.path.abspath(__file__)))\n # all_file_list = list(filter(file_filter, os.listdir(res_path)))\n # all_file_list = sorted(all_file_list, key=custom_sort)\n all_file_list = [\n '4.explain.sql']\n\n print(all_file_list)\n files_list = []\n for file in all_file_list:\n files_list.append(res_path + file)\n return files_list\n\n\ndef custom_sort(item):\n # \u63d0\u53d6\u6570\u5b57\u548c\u5b57\u6bcd\u90e8\u5206\n match = re.match(r'(\\d+)(\\D+)', item)\n # \u5c06\u6570\u5b57\u90e8\u5206\u8f6c\u6362\u4e3a\u6574\u6570\u4ee5\u8fdb\u884c\u6bd4\u8f83\n num_part = int(match.group(1))\n # \u8fd4\u56de\u5143\u7ec4\u4ee5\u6309\u6570\u5b57\u548c\u5b57\u6bcd\u6392\u5e8f\n return (num_part, match.group(2))\n\n\ndef file_filter(f):\n if f[-4:] == '.sql' and 'schema' not in f and 'fkindexes' not in f:\n return True\n else:\n return False\n\n\ndef get_sql_from_file(file_name):\n file = open(file_name)\n lines = file.readlines().copy()\n sql = ''\n for line in lines:\n sql += line\n sql = sql.replace('\n', ' ').replace(' ', ' ').replace(' ', ' ')\n file.close()\n return sql\n\n\ndef test_hint_from_file(sql_file):\n db = Database()\n sql = get_sql_from_file(sql_file)\n success, result_cont = db.execute_sql(sql)\n print(success, result_cont)\n\n\ndef test_all():\n sql_files = all_sql_files()\n\n for sql_file in list(sql_files):\n if sql_file:\n test_hint_from_file(sql_file)\n\n\ndef test_one():\n res_path = \"{}/tpch-queries/\".format(\n os.path.dirname(os.path.abspath(__file__)))\n test_hint_from_file(res_path + '1.explain.sql')\n\n\nif __name__ == '__main__':\n for i in range(0, REPEATCOUNT):\n TIMELOG.write(str(int(time.time()))+\";\")\n test_all()\n TIMELOG.write(str(int(time.time()))+\"\n\")\n TIMELOG.flush()\n\n TIMELOG.close()\n", + "description": "In an e-commerce platform's database, there is a scenario where a large amount of data needs to be fetched, specifically the inventory for each product. In order to find the inventory, correlated subqueries are being executed. 
However, if these subqueries are not optimized, the performance of the inventory query may be negatively affected.\n", + "desc": "In an e-commerce platform database, there is a table called 'ProductInventory' that stores inventory information for various products. This table contains data for tens of thousands or even hundreds of thousands of products, including their product ID, current stock level, last inventory update time, supplier ID, and warehouse location. When querying the inventory level of products, a common query involves retrieving the total current inventory of all products within a specific category. This type of query requires performing related subqueries, which can become inefficient when dealing with a large number of products. For example, if a category contains thousands of products, executing individual subqueries for each product can be time-consuming and lead to I/O bottlenecks.\n" + }, + "408": { + "start_time": "1697356408", + "end_time": "1697356480", + "start_timestamp": "2023-10-15 15:53:28", + "end_timestamp": "2023-10-15 15:54:40", + "alerts": [], + "labels": [ + "highly concurrent commits or highly concurrent inserts" + ], + "command": "python anomaly_trigger/main.py --anomaly INSERT_LARGE_DATA", + "script": "import psycopg2\nimport sys\nsys.path.append('/root/DB-GPT/')\nimport time\nimport datetime\nimport random\nimport yaml\nfrom multiprocessing.pool import *\n\nclass DBArgs(object):\n\n def __init__(self, dbtype, config, dbname=None):\n self.dbtype = dbtype\n if self.dbtype == 'mysql':\n self.host = config['host']\n self.port = config['port']\n self.user = config['user']\n self.password = config['password']\n self.dbname = dbname if dbname else config['dbname']\n self.driver = 'com.mysql.jdbc.Driver'\n self.jdbc = 'jdbc:mysql://'\n else:\n self.host = config['host']\n self.port = config['port']\n self.user = config['user']\n self.password = config['password']\n self.dbname = dbname if dbname else config['dbname']\n self.driver = 'org.postgresql.Driver'\n self.jdbc = 'jdbc:postgresql://'\n\nclass Database():\n def __init__(self, args, timeout=-1):\n self.args = args\n self.conn = self.resetConn(timeout)\n\n\n # self.schema = self.compute_table_schema()\n\n def resetConn(self, timeout=-1):\n conn = psycopg2.connect(database=self.args.dbname,\n user=self.args.user,\n password=self.args.password,\n host=self.args.host,\n port=self.args.port)\n return conn\n\n def execute_sqls(self,sql):\n self.conn =self.resetConn(timeout=-1)\n cur = self.conn.cursor()\n cur.execute(sql)\n self.conn.commit()\n cur.close()\n self.conn.close()\n\n def execute_sql_duration(self, duration, sql, max_id=0, commit_interval=500):\n self.conn = self.resetConn(timeout=-1)\n cursor = self.conn.cursor()\n start = time.time()\n cnt = 0\n if duration > 0:\n while (time.time() - start) < duration:\n if max_id > 0:\n id = random.randint(1, max_id - 1)\n cursor.execute(sql + str(id) + ';')\n else:\n cursor.execute(sql)\n cnt += 1\n if cnt % commit_interval == 0:\n self.conn.commit()\n else:\n print(\"error, the duration should be larger than 0\")\n self.conn.commit()\n cursor.close()\n self.conn.close()\n return cnt\n\n def concurrent_execute_sql(self, threads, duration, sql, max_id=0, commit_interval=500):\n pool = ThreadPool(threads)\n results = [pool.apply_async(self.execute_sql_duration, (duration, sql, max_id, commit_interval)) for _ in range(threads)]\n pool.close()\n pool.join()\n return results\n \ndef init():\n #add the config\n config_path = 
\"/root/DB-GPT/config/tool_config.yaml\"\n with open(config_path, 'r') as config_file:\n config = yaml.safe_load(config_file) \n db_args =DBArgs('pgsql', config)\n return db_args\n\n#create a table\ndef create_table(table_name,colsize, ncolumns):\n db=Database(init())\n column_definitions = ', '.join(f'name{i} varchar({colsize})' for i in range(ncolumns))\n creat_sql = f'CREATE TABLE {table_name} (id int, {column_definitions}, time timestamp);'\n db.execute_sqls(creat_sql)\n\n#delete the table\ndef delete_table(table_name):\n db=Database(init())\n delete_sql=f'DROP TABLE if exists {table_name}'\n db.execute_sqls(delete_sql)\n\n#print the current time\ndef print_time():\n current_time = datetime.datetime.now()\n formatted_time = current_time.strftime(\"%Y-%m-%d %H:%M:%S\")\n print(formatted_time)\n\n\ndef insert_large_data(threads,duration,ncolumns,nrows,colsize,table_name='table1'):\n\n print_time()\n #Delete undeleted tables\n delete_table(table_name)\n #create a new table\n create_table(table_name,colsize, ncolumns)\n db=Database(init())\n #insert the data\n #insert_definitions = ', '.join(f'repeat(round(random()*999)::text,{(colsize//3)})' for i in range(ncolumns))\n insert_definitions = ', '.join(f'(SELECT substr(md5(random()::text), 1, {colsize}))' for i in range(ncolumns))\n insert_data=f'insert into {table_name} select generate_series(1,{nrows}),{insert_definitions}, now();'\n db.concurrent_execute_sql(threads,duration,insert_data,commit_interval=1)\n\n #delete the table\n delete_table(table_name)\n \n #print the end time\n print_time()\nif __name__ == \"__main__\":\n # Number of threads to use for concurrent inserts\n num_threads = 182\n \n # Duration for which to run the inserts (in seconds)\n insert_duration = None\n \n # Number of columns in the table\n num_columns = 16\n \n # Number of rows to insert\n num_rows = 69\n \n # Size of each column (in characters)\n column_size = 70\n \n # Table name\n table_name = 'table1'\n \n # Call the insert_large_data function\n insert_large_data(num_threads, insert_duration, num_columns, num_rows, column_size, table_name)\n", + "description": "In a data-intensive application, if a large amount of data needs to be inserted into a database simultaneously by 182 threads, where each row contains 16 columns of data with a column size of 70 characters, there is a possibility of encountering a database exception due to the high workload.\n", + "desc": "In this file sharing system scenario, we can envision a database named 'TeamFileShareDB', which is a system used by teams or organizations for sharing files. This database not only stores the files themselves but also records the metadata of the files, such as uploader information, file size, creation and modification dates, version history, access permissions, and download counts. On a typical workday, multiple users might be simultaneously uploading, downloading, or editing files. For example, a project team is collaborating to complete an important report, with team members frequently uploading the latest versions of files, while others download them for viewing or editing. Additionally, the system might be used for storing and sharing large files, such as presentations, video conference recordings, or design drawings. Due to such high concurrency in file operations, the database 'TeamFileShareDB' faces the challenge of inserting a large volume of data. 
When 182 users simultaneously attempt to insert 69 rows of data into the database, the system may encounter performance issues and contention for system resources. This can lead to slower data insertion speeds, especially if the server's processing capabilities are limited or if proper optimization measures are not implemented. As a result, users may experience delays in file uploads or file metadata recording, which can impact the efficiency and usability of the file sharing system.\n" + }, + "409": { + "start_time": "1697356540", + "end_time": "1697356612", + "start_timestamp": "2023-10-15 15:55:40", + "end_timestamp": "2023-10-15 15:56:52", + "alerts": [], + "labels": [ + "highly concurrent commits or highly concurrent inserts" + ], + "command": "python anomaly_trigger/main.py --anomaly INSERT_LARGE_DATA", + "script": "import psycopg2\nimport sys\nsys.path.append('/root/DB-GPT/')\nimport time\nimport datetime\nimport random\nimport yaml\nfrom multiprocessing.pool import *\n\nclass DBArgs(object):\n\n def __init__(self, dbtype, config, dbname=None):\n self.dbtype = dbtype\n if self.dbtype == 'mysql':\n self.host = config['host']\n self.port = config['port']\n self.user = config['user']\n self.password = config['password']\n self.dbname = dbname if dbname else config['dbname']\n self.driver = 'com.mysql.jdbc.Driver'\n self.jdbc = 'jdbc:mysql://'\n else:\n self.host = config['host']\n self.port = config['port']\n self.user = config['user']\n self.password = config['password']\n self.dbname = dbname if dbname else config['dbname']\n self.driver = 'org.postgresql.Driver'\n self.jdbc = 'jdbc:postgresql://'\n\nclass Database():\n def __init__(self, args, timeout=-1):\n self.args = args\n self.conn = self.resetConn(timeout)\n\n\n # self.schema = self.compute_table_schema()\n\n def resetConn(self, timeout=-1):\n conn = psycopg2.connect(database=self.args.dbname,\n user=self.args.user,\n password=self.args.password,\n host=self.args.host,\n port=self.args.port)\n return conn\n\n def execute_sqls(self,sql):\n self.conn =self.resetConn(timeout=-1)\n cur = self.conn.cursor()\n cur.execute(sql)\n self.conn.commit()\n cur.close()\n self.conn.close()\n\n def execute_sql_duration(self, duration, sql, max_id=0, commit_interval=500):\n self.conn = self.resetConn(timeout=-1)\n cursor = self.conn.cursor()\n start = time.time()\n cnt = 0\n if duration > 0:\n while (time.time() - start) < duration:\n if max_id > 0:\n id = random.randint(1, max_id - 1)\n cursor.execute(sql + str(id) + ';')\n else:\n cursor.execute(sql)\n cnt += 1\n if cnt % commit_interval == 0:\n self.conn.commit()\n else:\n print(\"error, the duration should be larger than 0\")\n self.conn.commit()\n cursor.close()\n self.conn.close()\n return cnt\n\n def concurrent_execute_sql(self, threads, duration, sql, max_id=0, commit_interval=500):\n pool = ThreadPool(threads)\n results = [pool.apply_async(self.execute_sql_duration, (duration, sql, max_id, commit_interval)) for _ in range(threads)]\n pool.close()\n pool.join()\n return results\n \ndef init():\n #add the config\n config_path = \"/root/DB-GPT/config/tool_config.yaml\"\n with open(config_path, 'r') as config_file:\n config = yaml.safe_load(config_file) \n db_args =DBArgs('pgsql', config)\n return db_args\n\n#create a table\ndef create_table(table_name,colsize, ncolumns):\n db=Database(init())\n column_definitions = ', '.join(f'name{i} varchar({colsize})' for i in range(ncolumns))\n creat_sql = f'CREATE TABLE {table_name} (id int, {column_definitions}, time timestamp);'\n 
db.execute_sqls(creat_sql)\n\n#delete the table\ndef delete_table(table_name):\n db=Database(init())\n delete_sql=f'DROP TABLE if exists {table_name}'\n db.execute_sqls(delete_sql)\n\n#print the current time\ndef print_time():\n current_time = datetime.datetime.now()\n formatted_time = current_time.strftime(\"%Y-%m-%d %H:%M:%S\")\n print(formatted_time)\n\n\ndef insert_large_data(threads,duration,ncolumns,nrows,colsize,table_name='table1'):\n\n print_time()\n #Delete undeleted tables\n delete_table(table_name)\n #create a new table\n create_table(table_name,colsize, ncolumns)\n db=Database(init())\n #insert the data\n #insert_definitions = ', '.join(f'repeat(round(random()*999)::text,{(colsize//3)})' for i in range(ncolumns))\n insert_definitions = ', '.join(f'(SELECT substr(md5(random()::text), 1, {colsize}))' for i in range(ncolumns))\n insert_data=f'insert into {table_name} select generate_series(1,{nrows}),{insert_definitions}, now();'\n db.concurrent_execute_sql(threads,duration,insert_data,commit_interval=1)\n\n #delete the table\n delete_table(table_name)\n \n #print the end time\n print_time()\nif __name__ == \"__main__\":\n # Number of threads to use for concurrent inserts\n num_threads = 182\n \n # Duration for which to run the inserts (in seconds)\n insert_duration = None\n \n # Number of columns in the table\n num_columns = 20\n \n # Number of rows to insert\n num_rows = 94\n \n # Size of each column (in characters)\n column_size = 50\n \n # Table name\n table_name = 'table1'\n \n # Call the insert_large_data function\n insert_large_data(num_threads, insert_duration, num_columns, num_rows, column_size, table_name)\n", + "description": "In a data logging system, a large amount of data generated by 182 sensors needs to be inserted into the database simultaneously. The database table contains 20 columns and 94 rows, with each column size of 50 characters. This simulates the database exception caused by the high volume of data being inserted.\n", + "desc": "In an Internet of Things (IoT) scenario, imagine a database used for collecting and analyzing sensor data in a smart home environment named 'SmartHomeDB'. This database stores information from various sensors installed in the home, such as temperature, humidity, light, motion, and more. The primary table in this database is called 'SensorReadings' and it contains 94 rows of data, representing the sensor readings from different areas of the home, with a total of 20 columns. These columns include sensor ID, sensor type, reading value, timestamp, location, and other relevant information. Imagine a situation where 182 sensors in the smart home start transmitting data simultaneously at a high frequency. Due to the lack of optimization in the 'SensorReadings' table, such as efficient data partitioning or appropriate indexing, the database's ability to handle these concurrent write requests is limited. This can result in increased write latency in the database, leading to anomalies. As a consequence, there may be delays or failures in processing sensor data, affecting the overall performance and efficiency of the smart home system.In a real-life scenario, let's consider a database used by a bank for managing customer and transaction data. The database is named 'BankingDB' and contains a table called 'AccountTransactions' which records detailed information about various banking transactions. 
This table consists of 253 rows of data, each representing a transaction record for a customer account, with a total of 52 columns, each containing information of up to 95 characters. These columns may include transaction ID, account number, transaction type (such as deposit, withdrawal, or transfer), transaction amount, timestamp, counterparty account information, transaction status, employee ID, transaction location, currency type, and more.During the bank's busy hours, multiple users, such as bank staff, automated systems, or customers through an online banking platform, may simultaneously attempt frequent update operations on the 'AccountTransactions' table. For example, they may be updating transaction statuses, modifying transaction amounts, or adding transaction notes. In this scenario, a specific situation arises where 186 users attempt to perform such operations concurrently. Due to the design of the table and the database's locking mechanism, these concurrent update operations create a competition among users for locking the database table. This can cause contention among users for database resources, resulting in delayed or failed transaction requests and ultimately affecting the smooth operation of the bank.It is worth noting that the scenario presented above is generated randomly and does not represent a specific real-life case.\n" + }, + "410": { + "start_time": "1697356673", + "end_time": "1697356733", + "start_timestamp": "2023-10-15 15:57:53", + "end_timestamp": "2023-10-15 15:58:53", + "alerts": [], + "labels": [ + "highly concurrent updates" + ], + "command": "python anomaly_trigger/main.py --anomaly LOCK_CONTENTION", + "script": "import psycopg2\nimport sys\nsys.path.append('/root/DB-GPT/')\nimport time\nimport datetime\nimport random\nimport yaml\nfrom multiprocessing.pool import *\n\n\nclass DBArgs(object):\n\n def __init__(self, dbtype, config, dbname=None):\n self.dbtype = dbtype\n if self.dbtype == 'mysql':\n self.host = config['host']\n self.port = config['port']\n self.user = config['user']\n self.password = config['password']\n self.dbname = dbname if dbname else config['dbname']\n self.driver = 'com.mysql.jdbc.Driver'\n self.jdbc = 'jdbc:mysql://'\n else:\n self.host = config['host']\n self.port = config['port']\n self.user = config['user']\n self.password = config['password']\n self.dbname = dbname if dbname else config['dbname']\n self.driver = 'org.postgresql.Driver'\n self.jdbc = 'jdbc:postgresql://'\n\nclass Database():\n def __init__(self, args, timeout=-1):\n self.args = args\n self.conn = self.resetConn(timeout)\n\n\n # self.schema = self.compute_table_schema()\n\n def resetConn(self, timeout=-1):\n conn = psycopg2.connect(database=self.args.dbname,\n user=self.args.user,\n password=self.args.password,\n host=self.args.host,\n port=self.args.port)\n return conn\n \n def execute_sqls(self,sql):\n self.conn =self.resetConn(timeout=-1)\n cur = self.conn.cursor()\n cur.execute(sql)\n self.conn.commit()\n cur.close()\n self.conn.close()\n\n def execute_sql_duration(self, duration, sql, max_id=0, commit_interval=500):\n self.conn = self.resetConn(timeout=-1)\n cursor = self.conn.cursor()\n start = time.time()\n cnt = 0\n if duration > 0:\n while (time.time() - start) < duration:\n if max_id > 0:\n id = random.randint(1, max_id - 1)\n cursor.execute(sql + str(id) + ';')\n else:\n cursor.execute(sql)\n cnt += 1\n if cnt % commit_interval == 0:\n self.conn.commit()\n else:\n print(\"error, the duration should be larger than 0\")\n self.conn.commit()\n 
cursor.close()\n self.conn.close()\n return cnt\n\n def concurrent_execute_sql(self, threads, duration, sql, max_id=0, commit_interval=500):\n pool = ThreadPool(threads)\n results = [pool.apply_async(self.execute_sql_duration, (duration, sql, max_id, commit_interval)) for _ in range(threads)]\n pool.close()\n pool.join()\n return results\n\ndef init():\n #add the config\n config_path = \"/root/DB-GPT/config/tool_config.yaml\"\n with open(config_path, 'r') as config_file:\n config = yaml.safe_load(config_file) \n db_args =DBArgs('pgsql', config)\n return db_args\n\n\n#create a table\ndef create_table(table_name,colsize, ncolumns):\n db=Database(init())\n column_definitions = ', '.join(f'name{i} varchar({colsize})' for i in range(ncolumns))\n creat_sql = f'CREATE TABLE {table_name} (id int, {column_definitions}, time timestamp);'\n db.execute_sqls(creat_sql)\n\n#delete the table\ndef delete_table(table_name):\n db=Database(init())\n delete_sql=f'DROP TABLE if exists {table_name}'\n db.execute_sqls(delete_sql)\n\n#print the current time\ndef print_time():\n current_time = datetime.datetime.now()\n formatted_time = current_time.strftime(\"%Y-%m-%d %H:%M:%S\")\n print(formatted_time)\n\n\ndef lock_contention(threads,duration,ncolumns,nrows,colsize,table_name='table1'):\n #create a new table\n print_time()\n delete_table(table_name)\n create_table(table_name,colsize, ncolumns)\n db=Database(init())\n # insert some data to be updated \n insert_definitions = ', '.join(f'(SELECT substr(md5(random()::text), 1, {colsize}))' for i in range(ncolumns))\n insert_data=f'insert into {table_name} select generate_series(1,{nrows}),{insert_definitions}, now();' \n db.execute_sqls(insert_data) \n pool = Pool(threads)\n for _ in range(threads):\n pool.apply_async(\n lock, (table_name, ncolumns, colsize, duration, nrows))\n pool.close()\n pool.join()\n #delete the table\n delete_table(table_name)\n print_time()\n\ndef lock(table_name, ncolumns, colsize, duration, nrows):\n args=init()\n start = time.time()\n #lock_contention\n while time.time()-start < duration:\n conn = psycopg2.connect(database=args.dbname, user=args.user, password=args.password,\n host=args.host, port=args.port)\n cur = conn.cursor()\n while time.time()-start < duration:\n col_name = random.randint(0, ncolumns-1)\n row_name = random.randint(1, nrows-1)\n lock_contention = f'update {table_name} set name{col_name}=(SELECT substr(md5(random()::text), 1, {colsize})) where id ={row_name}'\n #db.concurrent_execute_sql(threads,duration,lock_contention,nrows)\n cur.execute(lock_contention)\n conn.commit()\n conn.commit()\n conn.close()\n\nif __name__ == \"__main__\":\n # Number of threads to use for concurrent inserts\n num_threads = 186\n \n # Duration for which to run the inserts (in seconds)\n insert_duration = None\n \n # Number of columns in the table\n num_columns = 52\n \n # Number of rows to insert\n num_rows = 253\n \n # Size of each column (in characters)\n column_size = 95\n \n # Table name\n table_name = 'table1'\n \n # Call the insert_large_data function\n lock_contention(num_threads, insert_duration, num_columns, num_rows, column_size, table_name)\n", + "description": "In a multi-user environment, 186 users are simultaneously trying to update a database table containing 52 columns and 253 rows of data, with each column having a size of 95 characters. 
These users are competing with each other to lock the table for performing the update operation, which may cause a database exception due to lock contention.\n", + "desc": "In a business intelligence scenario, a database called 'BusinessIntelDB' is used to store and analyze various business data. This database contains a key table named 'DataRecords', which records detailed information about business data. The table consists of 3,185,326 rows of data, each representing a specific data record, with a total of 16 columns, each containing information of up to 50 characters. These columns may include data ID, data type (such as sales, expenses, profits), data value, date, department, location, source, and more. In this specific scenario, a vacuum operation needs to be performed on the 'DataRecords' table. Vacuuming is a process in which the database reclaims unused space and improves the overall performance of the database by removing deleted or outdated data and reorganizing the table structures. However, performing a vacuum operation on such a large table with a high number of threads (122 in this case) can put significant pressure on the database system. This can lead to increased CPU usage and potentially impact other database operations being performed simultaneously. Therefore, it is important to carefully manage and schedule the vacuum operation to minimize its impact on the overall system performance.\n" + }, + "411": { + "start_time": "1697356793", + "end_time": "1697356855", + "start_timestamp": "2023-10-15 15:59:53", + "end_timestamp": "2023-10-15 16:00:55", + "alerts": [], + "labels": [ + "highly deletes" + ], + "command": "python anomaly_trigger/main.py --anomaly VACUUM", + "script": "import psycopg2\nimport sys\nsys.path.append('/root/DB-GPT/')\nimport time\nimport datetime\nimport random\nimport yaml\nfrom multiprocessing.pool import *\n\n\nclass DBArgs(object):\n\n def __init__(self, dbtype, config, dbname=None):\n self.dbtype = dbtype\n if self.dbtype == 'mysql':\n self.host = config['host']\n self.port = config['port']\n self.user = config['user']\n self.password = config['password']\n self.dbname = dbname if dbname else config['dbname']\n self.driver = 'com.mysql.jdbc.Driver'\n self.jdbc = 'jdbc:mysql://'\n else:\n self.host = config['host']\n self.port = config['port']\n self.user = config['user']\n self.password = config['password']\n self.dbname = dbname if dbname else config['dbname']\n self.driver = 'org.postgresql.Driver'\n self.jdbc = 'jdbc:postgresql://'\n\nclass Database():\n def __init__(self, args, timeout=-1):\n self.args = args\n self.conn = self.resetConn(timeout)\n\n\n # self.schema = self.compute_table_schema()\n\n def resetConn(self, timeout=-1):\n conn = psycopg2.connect(database=self.args.dbname,\n user=self.args.user,\n password=self.args.password,\n host=self.args.host,\n port=self.args.port)\n return conn\n \n def execute_sqls(self,sql):\n self.conn =self.resetConn(timeout=-1)\n cur = self.conn.cursor()\n cur.execute(sql)\n self.conn.commit()\n cur.close()\n self.conn.close()\n\n def execute_sql_duration(self, duration, sql, max_id=0, commit_interval=500):\n self.conn = self.resetConn(timeout=-1)\n cursor = self.conn.cursor()\n start = time.time()\n cnt = 0\n if duration > 0:\n while (time.time() - start) < duration:\n if max_id > 0:\n id = random.randint(1, max_id - 1)\n cursor.execute(sql + str(id) + ';')\n else:\n cursor.execute(sql)\n cnt += 1\n if cnt % commit_interval == 0:\n self.conn.commit()\n else:\n print(\"error, the duration should be 
larger than 0\")\n self.conn.commit()\n cursor.close()\n self.conn.close()\n return cnt\n\n def concurrent_execute_sql(self, threads, duration, sql, max_id=0, commit_interval=500):\n pool = ThreadPool(threads)\n results = [pool.apply_async(self.execute_sql_duration, (duration, sql, max_id, commit_interval)) for _ in range(threads)]\n pool.close()\n pool.join()\n return results\n \ndef init():\n #add the config\n config_path = \"/root/DB-GPT/config/tool_config.yaml\"\n with open(config_path, 'r') as config_file:\n config = yaml.safe_load(config_file) \n db_args =DBArgs('pgsql', config)\n return db_args\n\n\n#create a table\ndef create_table(table_name,colsize, ncolumns):\n db=Database(init())\n column_definitions = ', '.join(f'name{i} varchar({colsize})' for i in range(ncolumns))\n creat_sql = f'CREATE TABLE {table_name} (id int, {column_definitions}, time timestamp);'\n db.execute_sqls(creat_sql)\n\n#delete the table\ndef delete_table(table_name):\n db=Database(init())\n delete_sql=f'DROP TABLE if exists {table_name}'\n db.execute_sqls(delete_sql)\n\n#print the current time\ndef print_time():\n current_time = datetime.datetime.now()\n formatted_time = current_time.strftime(\"%Y-%m-%d %H:%M:%S\")\n print(formatted_time)\n\n\ndef vacuum(threads,duration,ncolumns,nrows,colsize,table_name='table1'):\n db=Database(init())\n #create a new table\n print_time()\n delete_table(table_name)\n create_table(table_name,colsize, ncolumns)\n\n # insert some data to be deleted\n insert_definitions = ', '.join(f'(SELECT substr(md5(random()::text), 1, {colsize}))' for i in range(ncolumns))\n insert_data=f'insert into {table_name} select generate_series(1,{nrows}),{insert_definitions}, now();' \n db.execute_sqls(insert_data) \n\n # delete 80% of the rows\n delete_nrows=int(nrows*0.8)\n vacuum=f'delete from {table_name} where id < {delete_nrows};'\n db.execute_sqls(vacuum)\n\n # do the select , then the vacuum occurs\n select='select * from '+table_name+' where id='\n db.concurrent_execute_sql(threads,duration,select,nrows)\n\n #delete the table\n delete_table(table_name)\n print_time()\n\n\nif __name__ == \"__main__\":\n # Number of threads to use for concurrent inserts\n num_threads = 122\n \n # Duration for which to run the inserts (in seconds)\n insert_duration = None\n \n # Number of columns in the table\n num_columns = 16\n \n # Number of rows to insert\n num_rows = 3185326\n \n # Size of each column (in characters)\n column_size = 50\n \n # Table name\n table_name = 'table1'\n \n # Call the insert_large_data function\n vacuum(num_threads, insert_duration, num_columns, num_rows, column_size, table_name)\n", + "description": "In the database of an online store, there is a need for a large-scale data cleaning operation. This script simulates the scenario where 122 users search the database table, which contains 16 columns, 3,185,326 rows, and each column has a size of 50 characters. However, if the database lacks proper index optimization, this can result in an exception occurring during the search process.\n", + "desc": "In an e-commerce platform's database, there is a table called 'ProductCatalog' that stores information about various products. This table contains a large number of rows (942,481) and columns (76), each representing a unique product entry. 
The columns can store up to 87 characters of information and may include details such as product ID, name, price, stock quantity, description, brand, category, size, color, weight, production date, expiration date, supplier information, country, rating, number of reviews, sales status, promotional information, image link, and other related attributes. In the platform's database, there is an issue related to redundant index creation. Typically, to optimize the performance of complex queries, multiple indexes are created on specific columns. These indexes might be based on factors such as product category, brand, or price range. However, in some cases, multiple redundant indexes might be created, which can lead to increased storage usage and performance overhead. This can affect the efficiency of query execution and result in delayed report generation, particularly in a business intelligence environment.\n" + }, + "412": { + "start_time": "1697356915", + "end_time": "1697357029", + "start_timestamp": "2023-10-15 16:01:55", + "end_timestamp": "2023-10-15 16:03:49", + "alerts": [], + "labels": [ + "too many indexes" + ], + "command": "python anomaly_trigger/main.py --anomaly REDUNDANT_INDEX", + "script": "import psycopg2\nimport sys\nsys.path.append('/root/DB-GPT/')\nimport time\nimport datetime\nimport random\nimport yaml\nfrom multiprocessing.pool import *\n\n\nclass DBArgs(object):\n\n def __init__(self, dbtype, config, dbname=None):\n self.dbtype = dbtype\n if self.dbtype == 'mysql':\n self.host = config['host']\n self.port = config['port']\n self.user = config['user']\n self.password = config['password']\n self.dbname = dbname if dbname else config['dbname']\n self.driver = 'com.mysql.jdbc.Driver'\n self.jdbc = 'jdbc:mysql://'\n else:\n self.host = config['host']\n self.port = config['port']\n self.user = config['user']\n self.password = config['password']\n self.dbname = dbname if dbname else config['dbname']\n self.driver = 'org.postgresql.Driver'\n self.jdbc = 'jdbc:postgresql://'\n\nclass Database():\n def __init__(self, args, timeout=-1):\n self.args = args\n self.conn = self.resetConn(timeout)\n\n\n # self.schema = self.compute_table_schema()\n\n def resetConn(self, timeout=-1):\n conn = psycopg2.connect(database=self.args.dbname,\n user=self.args.user,\n password=self.args.password,\n host=self.args.host,\n port=self.args.port)\n return conn\n \n def execute_sqls(self,sql):\n self.conn =self.resetConn(timeout=-1)\n cur = self.conn.cursor()\n cur.execute(sql)\n self.conn.commit()\n cur.close()\n self.conn.close()\n\n def execute_sql_duration(self, duration, sql, max_id=0, commit_interval=500):\n self.conn = self.resetConn(timeout=-1)\n cursor = self.conn.cursor()\n start = time.time()\n cnt = 0\n if duration > 0:\n while (time.time() - start) < duration:\n if max_id > 0:\n id = random.randint(1, max_id - 1)\n cursor.execute(sql + str(id) + ';')\n else:\n cursor.execute(sql)\n cnt += 1\n if cnt % commit_interval == 0:\n self.conn.commit()\n else:\n print(\"error, the duration should be larger than 0\")\n self.conn.commit()\n cursor.close()\n self.conn.close()\n return cnt\n\n def concurrent_execute_sql(self, threads, duration, sql, max_id=0, commit_interval=500):\n pool = ThreadPool(threads)\n results = [pool.apply_async(self.execute_sql_duration, (duration, sql, max_id, commit_interval)) for _ in range(threads)]\n pool.close()\n pool.join()\n return results\n \n def build_index(self, table_name, idx_num):\n self.conn = self.resetConn(timeout=-1)\n cursor = self.conn.cursor()\n \n 
for i in range(0, idx_num):\n the_sql = 'CREATE INDEX index_' + table_name + '_' + str(i) + ' ON ' + table_name + '(name' + str(i) + ');'\n print(the_sql)\n cursor.execute(the_sql)\n\n \n self.conn.commit()\n self.conn.close()\n return\n\n\n \n def drop_index(self,table_name):\n self.conn = self.resetConn(timeout=-1)\n cursor = self.conn.cursor()\n cursor.execute(\"select indexname from pg_indexes where tablename='\"+table_name+\"';\")\n idxs = cursor.fetchall()\n for idx in idxs:\n the_sql = 'DROP INDEX ' + idx[0] + ';'\n cursor.execute(the_sql)\n print(the_sql)\n self.conn.commit()\n self.conn.close()\n return\n\n\ndef init():\n #add the config\n config_path = \"/root/DB-GPT/config/tool_config.yaml\"\n with open(config_path, 'r') as config_file:\n config = yaml.safe_load(config_file) \n db_args =DBArgs('pgsql', config)\n return db_args\n\n\n#create a table\ndef create_table(table_name,colsize, ncolumns):\n db=Database(init())\n column_definitions = ', '.join(f'name{i} varchar({colsize})' for i in range(ncolumns))\n creat_sql = f'CREATE TABLE {table_name} (id int, {column_definitions}, time timestamp);'\n db.execute_sqls(creat_sql)\n\n#delete the table\ndef delete_table(table_name):\n db=Database(init())\n delete_sql=f'DROP TABLE if exists {table_name}'\n db.execute_sqls(delete_sql)\n\n#print the current time\ndef print_time():\n current_time = datetime.datetime.now()\n formatted_time = current_time.strftime(\"%Y-%m-%d %H:%M:%S\")\n print(formatted_time)\n\ndef redundent_index(threads,duration,ncolumns,nrows,colsize,nindex,table_name='table1'):\n #create a new table\n print_time()\n delete_table(table_name)\n create_table(table_name,colsize, ncolumns)\n db=Database(init())\n # insert some data to be updated \n insert_definitions = ', '.join(f'(SELECT substr(md5(random()::text), 1, {colsize}))' for i in range(ncolumns))\n insert_data=f'insert into {table_name} select generate_series(1,{nrows}),{insert_definitions}, now();' \n db.execute_sqls(insert_data) \n\n #initialization of the indexes\n nindex=int((nindex*ncolumns)/10)\n db.build_index(table_name,nindex)\n id_index='CREATE INDEX index_'+table_name+'_id ON '+table_name+'(id);'\n db.execute_sqls(id_index)\n\n #lock_contention\n pool = Pool(threads)\n for _ in range(threads):\n pool.apply_async(\n lock, (table_name, ncolumns, colsize, duration, nrows))\n pool.close()\n pool.join()\n\n #drop the index\n db.drop_index(table_name)\n\n #delete the table\n delete_table(table_name)\n print_time()\n\n\ndef lock(table_name, ncolumns, colsize, duration, nrows):\n args=init()\n start = time.time()\n #lock_contention\n while time.time()-start < duration:\n conn = psycopg2.connect(database=args.dbname, user=args.user, password=args.password,\n host=args.host, port=args.port)\n cur = conn.cursor()\n while time.time()-start < duration:\n col_name = random.randint(0, ncolumns-1)\n row_name = random.randint(1, nrows-1)\n lock_contention = f'update {table_name} set name{col_name}=(SELECT substr(md5(random()::text), 1, {colsize})) where id ={row_name}'\n #db.concurrent_execute_sql(threads,duration,lock_contention,nrows)\n cur.execute(lock_contention)\n conn.commit()\n conn.commit()\n conn.close()\n\nif __name__ == \"__main__\":\n # Number of threads to use for concurrent inserts\n num_threads = 9\n \n # Duration for which to run the inserts (in seconds)\n insert_duration = None\n \n # Number of columns in the table\n num_columns = 76\n \n # Number of rows to insert\n num_rows = 942481\n \n # Size of each column (in characters)\n column_size = 87\n \n # 
Table name\n table_name = 'table1'\n \n nindex=6\n \n # Call the insert_large_data function\n redundent_index(num_threads, insert_duration, num_columns, num_rows, column_size, nindex,table_name)\n", + "description": "In a large database with 76 columns and 942,481 rows, each with a column size of 87 characters, an excessive number of indexes are created for various attributes such as product name, category, and price range. This leads to extra storage usage and performance overhead.\n", + "desc": "In a file sharing system scenario, there is a database named 'TeamFileShareDB' that is used by teams or organizations for sharing files. This database not only stores the files themselves but also records metadata about the files, such as uploader information, file size, creation and modification dates, version history, access permissions, and download counts. During a typical workday, multiple users are likely to simultaneously upload, download, or edit files. For example, a project team might be collaborating on an important report, with team members frequently uploading the latest versions of files, while others download them for viewing or editing. Additionally, the system might be used for storing and sharing large files, such as presentations, video conference recordings, or design drawings. The high concurrency in file operations can lead to I/O (input/output) contention in the 'TeamFileShareDB' database. When multiple users simultaneously upload or download large files, the system's storage and network bandwidth can be significantly strained. This I/O contention can result in slower file transfer speeds, especially in situations where there is limited bandwidth or insufficient server processing capabilities. Furthermore, frequent write operations in the database, such as file uploads and metadata updates, can impact database performance. During peak periods, the database may encounter locking and transaction management issues, which further slow down file processing and metadata recording.\n" + }, + "413": { + "start_time": "1697357089", + "end_time": "1697357180", + "start_timestamp": "2023-10-15 16:04:49", + "end_timestamp": "2023-10-15 16:06:20", + "alerts": [], + "labels": [ + "INSERT_LARGE_DATA", + "IO_CONTENTION" + ], + "command": "python anomaly_trigger/main.py --anomaly INSERT_LARGE_DATA,IO_CONTENTION", + "script": "import os\nimport datetime\n\n#print the current time\ndef print_time():\n current_time = datetime.datetime.now()\n formatted_time = current_time.strftime(\"%Y-%m-%d %H:%M:%S\")\n print(formatted_time)\n\nif __name__ == \"__main__\":\n print_time()\n command = (\n \"su - root -c 'cd /sysbench-tpcc-master; \"\n \"./tpcc.lua --db-driver=pgsql --tables=2 --scale=3 --threads=50 --events=0 \"\n \"--pgsql-host=xxxx --pgsql-user=xxxx --pgsql-password=xxxx \"\n \"--pgsql-port=5432 --pgsql-db=tpcc --time=90 --rand-type=uniform --report-interval=10 run'\"\n )\n\n os.system(command)\n print_time()\n", + "description": "In a file sharing system, multiple users are uploading, downloading, or editing files at the same time. This creates a scenario where there is contention for input/output (I/O) resources, which slows down the file transfer process.\n", + "desc": "In a Business Intelligence (BI) scenario, imagine a database named 'CorporateAnalyticsDB', used for storing and analyzing various business data of a large corporation. 
This database contains multiple complex tables such as 'SalesData', 'CustomerProfiles', 'ProductCatalog', and 'MarketTrends', each filled with a large number of rows and columns, recording detailed business information.In this scenario, the company's analysts frequently need to perform complex join queries across multiple tables to generate comprehensive business reports. For example, they might need to join the 'SalesData' table (containing sales records) with the 'CustomerProfiles' table (containing customer information) to analyze the purchasing behaviors of different customer groups. At the same time, they might also need to link these data with 'ProductCatalog' (the product catalog) and 'MarketTrends' (market trends) to gain deeper market insights. Given the large size of these tables and the involvement of multi-table joins, the performance of these queries can become very slow. If the join keys in these tables are not effectively indexed, or if the queries are not properly optimized, then executing these join operations can consume a significant amount of time and resources. During peak periods, when multiple complex join queries are executed simultaneously, this can lead to competition for CPU resources, further reducing query efficiency. The CPU contention might occur due to too many compute-intensive queries running on the database server, or the server's CPU resources being insufficient to handle these queries.\n" + }, + "414": { + "start_time": "1697357240", + "end_time": "1697357300", + "start_timestamp": "2023-10-15 16:07:20", + "end_timestamp": "2023-10-15 16:08:20", + "alerts": [], + "labels": [ + "POOR JOIN PERFORMANCE", + "CPU CONTENTION" + ], + "command": "python anomaly_trigger/main.py --anomaly POOR_JOIN_PERFORMANCE,CPU_CONTENTION", + "script": "import os\nimport re\nimport time\n\nimport psycopg2\n\n\nREPEATCOUNT = 1\nTIMELOGPATH = str(int(time.time())) + \"_job_trigger_time_log.txt\"\nTIMELOG = open(TIMELOGPATH, 'w+')\n\n\nclass Database():\n\n def __init__(self):\n self.conn = None\n self.conn = psycopg2.connect(database='imdbload',\n user='xxxx',\n password='xxxx',\n host='xxxx',\n port=5432)\n\n def execute_sql(self, sql):\n fail = 1\n cur = self.conn.cursor()\n i = 0\n cnt = 3\n while fail == 1 and i < cnt:\n try:\n fail = 0\n cur.execute(sql)\n except BaseException as error:\n fail = 1\n print(error)\n res = []\n if fail == 0:\n res = cur.fetchall()\n i = i + 1\n if fail == 1:\n # print(\"SQL Execution Fatal!!\", sql)\n return 0, ''\n elif fail == 0:\n return 1, res\n\n\ndef all_sql_files():\n res_path = \"{}/join-order-benchmark-master/\".format(\n os.path.dirname(os.path.abspath(__file__)))\n # all_file_list = list(filter(file_filter, os.listdir(res_path)))\n # all_file_list = sorted(all_file_list, key=custom_sort)\n all_file_list = [\n '1a.sql', '1b.sql', '1c.sql', '1d.sql',\n '2a.sql', '2b.sql', '2c.sql', '2d.sql',\n '3a.sql', '3b.sql', '3c.sql',\n '4a.sql', '4b.sql', '4c.sql',\n '5a.sql', '5b.sql', '5c.sql',\n '6a.sql', '6b.sql', '6c.sql', '6d.sql', '6e.sql', '6f.sql',\n '7a.sql', '7b.sql', '7c.sql',\n '8a.sql', '8b.sql', '8c.sql', '8d.sql',\n '9a.sql', '9b.sql', '9c.sql', '9d.sql',\n '10a.sql', '10b.sql', '10c.sql',\n '11a.sql', '11b.sql', '11c.sql', '11d.sql',\n '12a.sql', '12b.sql', '12c.sql',\n '13a.sql', '13b.sql', '13c.sql', '13d.sql',\n '14a.sql', '14b.sql', '14c.sql',\n '15a.sql', '15b.sql', '15c.sql', '15d.sql',\n '16a.sql', '16b.sql', '16c.sql', '16d.sql',\n '17a.sql', '17b.sql', '17c.sql', '17d.sql', '17e.sql', '17f.sql',\n '18a.sql', '18b.sql', 
'18c.sql',\n '19a.sql', '19b.sql', '19c.sql', '19d.sql',\n '20a.sql', '20b.sql', '20c.sql',\n '21a.sql', '21b.sql', '21c.sql',\n '22a.sql', '22b.sql', '22c.sql', '22d.sql',\n '23a.sql', '23b.sql', '23c.sql',\n '24a.sql', '24b.sql',\n '25a.sql', '25b.sql', '25c.sql',\n '26a.sql', '26b.sql', '26c.sql',\n '27a.sql', '27b.sql', '27c.sql',\n '28a.sql', '28b.sql', '28c.sql',\n '29a.sql', '29b.sql', '29c.sql',\n '30a.sql', '30b.sql', '30c.sql',\n '31a.sql', '31b.sql', '31c.sql',\n '32a.sql', '32b.sql',\n '33a.sql', '33b.sql', '33c.sql']\n\n print(all_file_list)\n files_list = []\n for file in all_file_list:\n files_list.append(res_path + file)\n return files_list\n\n\ndef custom_sort(item):\n # \u63d0\u53d6\u6570\u5b57\u548c\u5b57\u6bcd\u90e8\u5206\n match = re.match(r'(\\d+)(\\D+)', item)\n # \u5c06\u6570\u5b57\u90e8\u5206\u8f6c\u6362\u4e3a\u6574\u6570\u4ee5\u8fdb\u884c\u6bd4\u8f83\n num_part = int(match.group(1))\n # \u8fd4\u56de\u5143\u7ec4\u4ee5\u6309\u6570\u5b57\u548c\u5b57\u6bcd\u6392\u5e8f\n return (num_part, match.group(2))\n\n\ndef file_filter(f):\n if f[-4:] == '.sql' and 'schema' not in f and 'fkindexes' not in f:\n return True\n else:\n return False\n\n\ndef get_sql_from_file(file_name):\n file = open(file_name)\n lines = file.readlines().copy()\n sql = ''\n for line in lines:\n sql += line\n sql = sql.replace('\n', ' ').replace(' ', ' ').replace(' ', ' ')\n file.close()\n return sql\n\n\ndef test_hint_from_file(sql_file):\n db = Database()\n sql = get_sql_from_file(sql_file)\n success, result_cont = db.execute_sql(sql)\n print(success, result_cont)\n\n\ndef test_all():\n sql_files = all_sql_files()\n\n for sql_file in list(sql_files)[:-10]:\n if sql_file:\n test_hint_from_file(sql_file)\n\n\ndef test_one():\n res_path = \"{}/join-order-benchmark-master/\".format(\n os.path.dirname(os.path.abspath(__file__)))\n test_hint_from_file(res_path + '1a.sql')\n\n\nif __name__ == '__main__':\n for i in range(0, REPEATCOUNT):\n TIMELOG.write(str(int(time.time()))+\";\")\n test_all()\n TIMELOG.write(str(int(time.time()))+\"\n\")\n TIMELOG.flush()\n\n TIMELOG.close()\n\n)\n", + "description": "In a large-scale data analysis system, multiple parallel join operations are performed on a database table with poor join performance. This results in CPU contention, where multiple users compete for system resources, causing a slowdown in overall performance.\n", + "desc": "In an IoT scenario, let's consider a database called 'IoTDataDB' that is used to store and analyze data collected from various IoT devices. This database contains a main table named 'DeviceReadings' that stores data readings from different sensors. Each row in this table represents a reading from a specific sensor and includes information such as sensor ID, timestamp, location, reading value, and sensor type. In this database, there might be situations where users need to fetch a large amount of data from the 'DeviceReadings' table using correlated subqueries. For example, they might want to retrieve data from all sensors located in a specific geographic area or all readings taken within a certain time range. Due to the size of the table and the complexity of the subqueries required, performing these fetch operations can be time-consuming and inefficient. 
This can result in slow query speeds and increased database load, especially when dealing with a large volume of data.\n" + }, + "415": { + "start_time": "1697357360", + "end_time": "1697357510", + "start_timestamp": "2023-10-15 16:09:20", + "end_timestamp": "2023-10-15 16:11:50", + "alerts": [], + "labels": [ + "FETCH_LARGE_DATA", + "CORRELATED SUBQUERY" + ], + "command": "python anomaly_trigger/main.py --anomaly FETCH_LARGE_DATA,CORRELATED_SUBQUERY", + "script": "import os\nimport re\nimport time\n\nimport psycopg2\n\n\nREPEATCOUNT = 1\nTIMELOGPATH = str(int(time.time())) + \"_tpch_trigger_time_log.txt\"\nTIMELOG = open(TIMELOGPATH, 'w+')\n\n\nclass Database():\n\n def __init__(self):\n self.conn = None\n self.conn = psycopg2.connect(database='tpch',\n user='xxxx',\n password='xxxx',\n host='xxxx',\n port=5432)\n\n def execute_sql(self, sql):\n fail = 1\n cur = self.conn.cursor()\n i = 0\n cnt = 3\n while fail == 1 and i < cnt:\n try:\n fail = 0\n cur.execute(sql)\n except BaseException as error:\n fail = 1\n print(error)\n res = []\n if fail == 0:\n res = cur.fetchall()\n i = i + 1\n if fail == 1:\n # print(\"SQL Execution Fatal!!\", sql)\n return 0, ''\n elif fail == 0:\n return 1, res\n\n\ndef all_sql_files():\n res_path = \"{}/tpch-queries/\".format(\n os.path.dirname(os.path.abspath(__file__)))\n # all_file_list = list(filter(file_filter, os.listdir(res_path)))\n # all_file_list = sorted(all_file_list, key=custom_sort)\n all_file_list = [\n '4.explain.sql']\n\n print(all_file_list)\n files_list = []\n for file in all_file_list:\n files_list.append(res_path + file)\n return files_list\n\n\ndef custom_sort(item):\n # \u63d0\u53d6\u6570\u5b57\u548c\u5b57\u6bcd\u90e8\u5206\n match = re.match(r'(\\d+)(\\D+)', item)\n # \u5c06\u6570\u5b57\u90e8\u5206\u8f6c\u6362\u4e3a\u6574\u6570\u4ee5\u8fdb\u884c\u6bd4\u8f83\n num_part = int(match.group(1))\n # \u8fd4\u56de\u5143\u7ec4\u4ee5\u6309\u6570\u5b57\u548c\u5b57\u6bcd\u6392\u5e8f\n return (num_part, match.group(2))\n\n\ndef file_filter(f):\n if f[-4:] == '.sql' and 'schema' not in f and 'fkindexes' not in f:\n return True\n else:\n return False\n\n\ndef get_sql_from_file(file_name):\n file = open(file_name)\n lines = file.readlines().copy()\n sql = ''\n for line in lines:\n sql += line\n sql = sql.replace('\n', ' ').replace(' ', ' ').replace(' ', ' ')\n file.close()\n return sql\n\n\ndef test_hint_from_file(sql_file):\n db = Database()\n sql = get_sql_from_file(sql_file)\n success, result_cont = db.execute_sql(sql)\n print(success, result_cont)\n\n\ndef test_all():\n sql_files = all_sql_files()\n\n for sql_file in list(sql_files):\n if sql_file:\n test_hint_from_file(sql_file)\n\n\ndef test_one():\n res_path = \"{}/tpch-queries/\".format(\n os.path.dirname(os.path.abspath(__file__)))\n test_hint_from_file(res_path + '1.explain.sql')\n\n\nif __name__ == '__main__':\n for i in range(0, REPEATCOUNT):\n TIMELOG.write(str(int(time.time()))+\";\")\n test_all()\n TIMELOG.write(str(int(time.time()))+\"\n\")\n TIMELOG.flush()\n\n TIMELOG.close()\n", + "description": "In a real-life scenario, a database in an e-commerce platform is experiencing performance issues when trying to fetch large amounts of data by executing correlated subqueries. This particularly impacts the process of retrieving inventory information for each product, causing delays and slowing down the overall system performance.\n", + "desc": "In an Internet of Things (IoT) scenario, a database named 'IoTDataDB' is used to store and process data collected from a network of 152 IoT devices. 
These devices are configured to collect various types of sensor data, such as temperature, humidity, pressure, light, and motion. The data generated by these sensors is stored in a table named 'SensorData', which contains 55 rows of data, each representing a specific data reading, with 12 columns, each capable of storing up to 38 characters. These columns may include sensor ID, sensor type, data value, timestamp, location, and other related information. \n" + }, + "416": { + "start_time": "1697357570", + "end_time": "1697357642", + "start_timestamp": "2023-10-15 16:12:50", + "end_timestamp": "2023-10-15 16:14:02", + "alerts": [], + "labels": [ + "highly concurrent commits or highly concurrent inserts" + ], + "command": "python anomaly_trigger/main.py --anomaly INSERT_LARGE_DATA", + "script": "import psycopg2\nimport sys\nsys.path.append('/root/DB-GPT/')\nimport time\nimport datetime\nimport random\nimport yaml\nfrom multiprocessing.pool import *\n\nclass DBArgs(object):\n\n def __init__(self, dbtype, config, dbname=None):\n self.dbtype = dbtype\n if self.dbtype == 'mysql':\n self.host = config['host']\n self.port = config['port']\n self.user = config['user']\n self.password = config['password']\n self.dbname = dbname if dbname else config['dbname']\n self.driver = 'com.mysql.jdbc.Driver'\n self.jdbc = 'jdbc:mysql://'\n else:\n self.host = config['host']\n self.port = config['port']\n self.user = config['user']\n self.password = config['password']\n self.dbname = dbname if dbname else config['dbname']\n self.driver = 'org.postgresql.Driver'\n self.jdbc = 'jdbc:postgresql://'\n\nclass Database():\n def __init__(self, args, timeout=-1):\n self.args = args\n self.conn = self.resetConn(timeout)\n\n\n # self.schema = self.compute_table_schema()\n\n def resetConn(self, timeout=-1):\n conn = psycopg2.connect(database=self.args.dbname,\n user=self.args.user,\n password=self.args.password,\n host=self.args.host,\n port=self.args.port)\n return conn\n\n def execute_sqls(self,sql):\n self.conn =self.resetConn(timeout=-1)\n cur = self.conn.cursor()\n cur.execute(sql)\n self.conn.commit()\n cur.close()\n self.conn.close()\n\n def execute_sql_duration(self, duration, sql, max_id=0, commit_interval=500):\n self.conn = self.resetConn(timeout=-1)\n cursor = self.conn.cursor()\n start = time.time()\n cnt = 0\n if duration > 0:\n while (time.time() - start) < duration:\n if max_id > 0:\n id = random.randint(1, max_id - 1)\n cursor.execute(sql + str(id) + ';')\n else:\n cursor.execute(sql)\n cnt += 1\n if cnt % commit_interval == 0:\n self.conn.commit()\n else:\n print(\"error, the duration should be larger than 0\")\n self.conn.commit()\n cursor.close()\n self.conn.close()\n return cnt\n\n def concurrent_execute_sql(self, threads, duration, sql, max_id=0, commit_interval=500):\n pool = ThreadPool(threads)\n results = [pool.apply_async(self.execute_sql_duration, (duration, sql, max_id, commit_interval)) for _ in range(threads)]\n pool.close()\n pool.join()\n return results\n \ndef init():\n #add the config\n config_path = \"/root/DB-GPT/config/tool_config.yaml\"\n with open(config_path, 'r') as config_file:\n config = yaml.safe_load(config_file) \n db_args =DBArgs('pgsql', config)\n return db_args\n\n#create a table\ndef create_table(table_name,colsize, ncolumns):\n db=Database(init())\n column_definitions = ', '.join(f'name{i} varchar({colsize})' for i in range(ncolumns))\n creat_sql = f'CREATE TABLE {table_name} (id int, {column_definitions}, time timestamp);'\n db.execute_sqls(creat_sql)\n\n#delete the 
table\ndef delete_table(table_name):\n db=Database(init())\n delete_sql=f'DROP TABLE if exists {table_name}'\n db.execute_sqls(delete_sql)\n\n#print the current time\ndef print_time():\n current_time = datetime.datetime.now()\n formatted_time = current_time.strftime(\"%Y-%m-%d %H:%M:%S\")\n print(formatted_time)\n\n\ndef insert_large_data(threads,duration,ncolumns,nrows,colsize,table_name='table1'):\n\n print_time()\n #Delete undeleted tables\n delete_table(table_name)\n #create a new table\n create_table(table_name,colsize, ncolumns)\n db=Database(init())\n #insert the data\n #insert_definitions = ', '.join(f'repeat(round(random()*999)::text,{(colsize//3)})' for i in range(ncolumns))\n insert_definitions = ', '.join(f'(SELECT substr(md5(random()::text), 1, {colsize}))' for i in range(ncolumns))\n insert_data=f'insert into {table_name} select generate_series(1,{nrows}),{insert_definitions}, now();'\n db.concurrent_execute_sql(threads,duration,insert_data,commit_interval=1)\n\n #delete the table\n delete_table(table_name)\n \n #print the end time\n print_time()\nif __name__ == \"__main__\":\n # Number of threads to use for concurrent inserts\n num_threads = 152\n \n # Duration for which to run the inserts (in seconds)\n insert_duration = None\n \n # Number of columns in the table\n num_columns = 12\n \n # Number of rows to insert\n num_rows = 55\n \n # Size of each column (in characters)\n column_size = 38\n \n # Table name\n table_name = 'table1'\n \n # Call the insert_large_data function\n insert_large_data(num_threads, insert_duration, num_columns, num_rows, column_size, table_name)\n", + "description": "In an IoT application, a large amount of data generated by 152 sensors needs to be inserted into the database simultaneously. This process simulates a database exception caused by the insertion of data into a database table containing 12 columns, 55 rows, and each column having a size of 38 characters.\n", + "desc": "During normal operation, these IoT devices transmit data to the database at a high frequency, generating a large volume of data. However, when all 152 devices simultaneously start transmitting data, it can overwhelm the database's performance. Due to the lack of optimization measures in the database, such as proper indexing, efficient data storage techniques, or suitable data partitioning strategies, the database may struggle to handle the large influx of data. As a result, the database's write operations might become slow or even fail, leading to anomalies in data insertion. 
This can impact real-time monitoring, data analysis, and other IoT-related applications that rely on timely access to this sensor data.\n" + }, + "417": { + "start_time": "1697357702", + "end_time": "1697357774", + "start_timestamp": "2023-10-15 16:15:02", + "end_timestamp": "2023-10-15 16:16:14", + "alerts": [], + "labels": [ + "highly concurrent commits or highly concurrent inserts" + ], + "command": "python anomaly_trigger/main.py --anomaly INSERT_LARGE_DATA", + "script": "import psycopg2\nimport sys\nsys.path.append('/root/DB-GPT/')\nimport time\nimport datetime\nimport random\nimport yaml\nfrom multiprocessing.pool import *\n\nclass DBArgs(object):\n\n def __init__(self, dbtype, config, dbname=None):\n self.dbtype = dbtype\n if self.dbtype == 'mysql':\n self.host = config['host']\n self.port = config['port']\n self.user = config['user']\n self.password = config['password']\n self.dbname = dbname if dbname else config['dbname']\n self.driver = 'com.mysql.jdbc.Driver'\n self.jdbc = 'jdbc:mysql://'\n else:\n self.host = config['host']\n self.port = config['port']\n self.user = config['user']\n self.password = config['password']\n self.dbname = dbname if dbname else config['dbname']\n self.driver = 'org.postgresql.Driver'\n self.jdbc = 'jdbc:postgresql://'\n\nclass Database():\n def __init__(self, args, timeout=-1):\n self.args = args\n self.conn = self.resetConn(timeout)\n\n\n # self.schema = self.compute_table_schema()\n\n def resetConn(self, timeout=-1):\n conn = psycopg2.connect(database=self.args.dbname,\n user=self.args.user,\n password=self.args.password,\n host=self.args.host,\n port=self.args.port)\n return conn\n\n def execute_sqls(self,sql):\n self.conn =self.resetConn(timeout=-1)\n cur = self.conn.cursor()\n cur.execute(sql)\n self.conn.commit()\n cur.close()\n self.conn.close()\n\n def execute_sql_duration(self, duration, sql, max_id=0, commit_interval=500):\n self.conn = self.resetConn(timeout=-1)\n cursor = self.conn.cursor()\n start = time.time()\n cnt = 0\n if duration > 0:\n while (time.time() - start) < duration:\n if max_id > 0:\n id = random.randint(1, max_id - 1)\n cursor.execute(sql + str(id) + ';')\n else:\n cursor.execute(sql)\n cnt += 1\n if cnt % commit_interval == 0:\n self.conn.commit()\n else:\n print(\"error, the duration should be larger than 0\")\n self.conn.commit()\n cursor.close()\n self.conn.close()\n return cnt\n\n def concurrent_execute_sql(self, threads, duration, sql, max_id=0, commit_interval=500):\n pool = ThreadPool(threads)\n results = [pool.apply_async(self.execute_sql_duration, (duration, sql, max_id, commit_interval)) for _ in range(threads)]\n pool.close()\n pool.join()\n return results\n \ndef init():\n #add the config\n config_path = \"/root/DB-GPT/config/tool_config.yaml\"\n with open(config_path, 'r') as config_file:\n config = yaml.safe_load(config_file) \n db_args =DBArgs('pgsql', config)\n return db_args\n\n#create a table\ndef create_table(table_name,colsize, ncolumns):\n db=Database(init())\n column_definitions = ', '.join(f'name{i} varchar({colsize})' for i in range(ncolumns))\n creat_sql = f'CREATE TABLE {table_name} (id int, {column_definitions}, time timestamp);'\n db.execute_sqls(creat_sql)\n\n#delete the table\ndef delete_table(table_name):\n db=Database(init())\n delete_sql=f'DROP TABLE if exists {table_name}'\n db.execute_sqls(delete_sql)\n\n#print the current time\ndef print_time():\n current_time = datetime.datetime.now()\n formatted_time = current_time.strftime(\"%Y-%m-%d %H:%M:%S\")\n print(formatted_time)\n\n\ndef 
insert_large_data(threads,duration,ncolumns,nrows,colsize,table_name='table1'):\n\n print_time()\n #Delete undeleted tables\n delete_table(table_name)\n #create a new table\n create_table(table_name,colsize, ncolumns)\n db=Database(init())\n #insert the data\n #insert_definitions = ', '.join(f'repeat(round(random()*999)::text,{(colsize//3)})' for i in range(ncolumns))\n insert_definitions = ', '.join(f'(SELECT substr(md5(random()::text), 1, {colsize}))' for i in range(ncolumns))\n insert_data=f'insert into {table_name} select generate_series(1,{nrows}),{insert_definitions}, now();'\n db.concurrent_execute_sql(threads,duration,insert_data,commit_interval=1)\n\n #delete the table\n delete_table(table_name)\n \n #print the end time\n print_time()\nif __name__ == \"__main__\":\n # Number of threads to use for concurrent inserts\n num_threads = 152\n \n # Duration for which to run the inserts (in seconds)\n insert_duration = None\n \n # Number of columns in the table\n num_columns = 33\n \n # Number of rows to insert\n num_rows = 95\n \n # Size of each column (in characters)\n column_size = 67\n \n # Table name\n table_name = 'table1'\n \n # Call the insert_large_data function\n insert_large_data(num_threads, insert_duration, num_columns, num_rows, column_size, table_name)\n", + "description": "In an Internet of Things (IoT) system, 152 sensors generate a large amount of data that needs to be inserted into a database simultaneously. This data insertion process causes contention and can result in exceptions in the database. The database table has 33 columns with a column size of 67 characters and a total of 95 rows.\n", + "desc": "In an Internet of Things (IoT) scenario, we can imagine a smart home system where multiple sensors are installed to monitor various aspects of the home environment, such as temperature, humidity, motion, and light. These sensors generate a large amount of data that needs to be stored and analyzed in a database, let's call it 'SmartHomeDB'. One of the key tables in this database is 'SensorData', which stores the data from these sensors. In this specific scenario, the database is expected to handle a high volume of sensor data, with 95 rows of data being inserted for each sensor at a frequency of 152 threads. Each row in the 'SensorData' table represents a data reading from a sensor and contains 33 columns, each with a size of 67 characters. These columns may include sensor ID, reading type, reading value, timestamp, sensor location, and other relevant information.However, due to the large number of concurrent insert operations and the size of the data being inserted, the database might encounter performance issues. This could be caused by factors such as insufficient disk I/O bandwidth, limited memory resources, or inefficient data insertion methods. Such performance issues can lead to increased latency in data insertion, delayed processing of sensor data, or even database crashes.In this scenario, the 'INSERT_LARGE_DATA' anomaly refers to the performance issues that can arise when inserting a large volume of sensor data into the 'SensorData' table. 
The high frequency of insert operations and the large amount of data being inserted can strain the database's resources and impact its overall performance.\n" + }, + "418": { + "start_time": "1697357834", + "end_time": "1697357895", + "start_timestamp": "2023-10-15 16:17:14", + "end_timestamp": "2023-10-15 16:18:15", + "alerts": [], + "labels": [ + "highly concurrent updates" + ], + "command": "python anomaly_trigger/main.py --anomaly LOCK_CONTENTION", + "script": "import psycopg2\nimport sys\nsys.path.append('/root/DB-GPT/')\nimport time\nimport datetime\nimport random\nimport yaml\nfrom multiprocessing.pool import *\n\n\nclass DBArgs(object):\n\n def __init__(self, dbtype, config, dbname=None):\n self.dbtype = dbtype\n if self.dbtype == 'mysql':\n self.host = config['host']\n self.port = config['port']\n self.user = config['user']\n self.password = config['password']\n self.dbname = dbname if dbname else config['dbname']\n self.driver = 'com.mysql.jdbc.Driver'\n self.jdbc = 'jdbc:mysql://'\n else:\n self.host = config['host']\n self.port = config['port']\n self.user = config['user']\n self.password = config['password']\n self.dbname = dbname if dbname else config['dbname']\n self.driver = 'org.postgresql.Driver'\n self.jdbc = 'jdbc:postgresql://'\n\nclass Database():\n def __init__(self, args, timeout=-1):\n self.args = args\n self.conn = self.resetConn(timeout)\n\n\n # self.schema = self.compute_table_schema()\n\n def resetConn(self, timeout=-1):\n conn = psycopg2.connect(database=self.args.dbname,\n user=self.args.user,\n password=self.args.password,\n host=self.args.host,\n port=self.args.port)\n return conn\n \n def execute_sqls(self,sql):\n self.conn =self.resetConn(timeout=-1)\n cur = self.conn.cursor()\n cur.execute(sql)\n self.conn.commit()\n cur.close()\n self.conn.close()\n\n def execute_sql_duration(self, duration, sql, max_id=0, commit_interval=500):\n self.conn = self.resetConn(timeout=-1)\n cursor = self.conn.cursor()\n start = time.time()\n cnt = 0\n if duration > 0:\n while (time.time() - start) < duration:\n if max_id > 0:\n id = random.randint(1, max_id - 1)\n cursor.execute(sql + str(id) + ';')\n else:\n cursor.execute(sql)\n cnt += 1\n if cnt % commit_interval == 0:\n self.conn.commit()\n else:\n print(\"error, the duration should be larger than 0\")\n self.conn.commit()\n cursor.close()\n self.conn.close()\n return cnt\n\n def concurrent_execute_sql(self, threads, duration, sql, max_id=0, commit_interval=500):\n pool = ThreadPool(threads)\n results = [pool.apply_async(self.execute_sql_duration, (duration, sql, max_id, commit_interval)) for _ in range(threads)]\n pool.close()\n pool.join()\n return results\n\ndef init():\n #add the config\n config_path = \"/root/DB-GPT/config/tool_config.yaml\"\n with open(config_path, 'r') as config_file:\n config = yaml.safe_load(config_file) \n db_args =DBArgs('pgsql', config)\n return db_args\n\n\n#create a table\ndef create_table(table_name,colsize, ncolumns):\n db=Database(init())\n column_definitions = ', '.join(f'name{i} varchar({colsize})' for i in range(ncolumns))\n creat_sql = f'CREATE TABLE {table_name} (id int, {column_definitions}, time timestamp);'\n db.execute_sqls(creat_sql)\n\n#delete the table\ndef delete_table(table_name):\n db=Database(init())\n delete_sql=f'DROP TABLE if exists {table_name}'\n db.execute_sqls(delete_sql)\n\n#print the current time\ndef print_time():\n current_time = datetime.datetime.now()\n formatted_time = current_time.strftime(\"%Y-%m-%d %H:%M:%S\")\n print(formatted_time)\n\n\ndef 
lock_contention(threads,duration,ncolumns,nrows,colsize,table_name='table1'):\n #create a new table\n print_time()\n delete_table(table_name)\n create_table(table_name,colsize, ncolumns)\n db=Database(init())\n # insert some data to be updated \n insert_definitions = ', '.join(f'(SELECT substr(md5(random()::text), 1, {colsize}))' for i in range(ncolumns))\n insert_data=f'insert into {table_name} select generate_series(1,{nrows}),{insert_definitions}, now();' \n db.execute_sqls(insert_data) \n pool = Pool(threads)\n for _ in range(threads):\n pool.apply_async(\n lock, (table_name, ncolumns, colsize, duration, nrows))\n pool.close()\n pool.join()\n #delete the table\n delete_table(table_name)\n print_time()\n\ndef lock(table_name, ncolumns, colsize, duration, nrows):\n args=init()\n start = time.time()\n #lock_contention\n while time.time()-start < duration:\n conn = psycopg2.connect(database=args.dbname, user=args.user, password=args.password,\n host=args.host, port=args.port)\n cur = conn.cursor()\n while time.time()-start < duration:\n col_name = random.randint(0, ncolumns-1)\n row_name = random.randint(1, nrows-1)\n lock_contention = f'update {table_name} set name{col_name}=(SELECT substr(md5(random()::text), 1, {colsize})) where id ={row_name}'\n #db.concurrent_execute_sql(threads,duration,lock_contention,nrows)\n cur.execute(lock_contention)\n conn.commit()\n conn.commit()\n conn.close()\n\nif __name__ == \"__main__\":\n # Number of threads to use for concurrent inserts\n num_threads = 123\n \n # Duration for which to run the inserts (in seconds)\n insert_duration = None\n \n # Number of columns in the table\n num_columns = 62\n \n # Number of rows to insert\n num_rows = 303\n \n # Size of each column (in characters)\n column_size = 73\n \n # Table name\n table_name = 'table1'\n \n # Call the insert_large_data function\n lock_contention(num_threads, insert_duration, num_columns, num_rows, column_size, table_name)\n", + "description": "In an online database, 123 users simultaneously try to update a table containing 62 columns and 303 rows of data. Each column has a size of 73 characters. These users compete with each other to lock the table for performing update operations, resulting in potential database exceptions.\n", + "desc": "In this scenario, we can imagine a database in a file sharing system, named 'FileShareDB'. This database is used by teams or organizations for sharing files and storing file metadata. It records information such as file uploader, file size, creation and modification dates, version history, access permissions, and download counts. At a specific moment, 123 users are simultaneously uploading, downloading, or editing files on the file sharing system. Due to the high concurrency of file operations, the 'FileShareDB' database might encounter locking contention. This means that multiple users may compete for locks on the same or adjacent rows in the database table. If this contention lasts for a significant amount of time, it can lead to performance issues in the database. 
During peak file sharing periods, such prolonged locking can cause delays or failures in file transfer operations, affecting the overall efficiency of the file sharing system.\n" + }, + "419": { + "start_time": "1697357955", + "end_time": "1697358071", + "start_timestamp": "2023-10-15 16:19:15", + "end_timestamp": "2023-10-15 16:21:11", + "alerts": [], + "labels": [ + "highly deletes" + ], + "command": "python anomaly_trigger/main.py --anomaly VACUUM", + "script": "import psycopg2\nimport sys\nsys.path.append('/root/DB-GPT/')\nimport time\nimport datetime\nimport random\nimport yaml\nfrom multiprocessing.pool import *\n\n\nclass DBArgs(object):\n\n def __init__(self, dbtype, config, dbname=None):\n self.dbtype = dbtype\n if self.dbtype == 'mysql':\n self.host = config['host']\n self.port = config['port']\n self.user = config['user']\n self.password = config['password']\n self.dbname = dbname if dbname else config['dbname']\n self.driver = 'com.mysql.jdbc.Driver'\n self.jdbc = 'jdbc:mysql://'\n else:\n self.host = config['host']\n self.port = config['port']\n self.user = config['user']\n self.password = config['password']\n self.dbname = dbname if dbname else config['dbname']\n self.driver = 'org.postgresql.Driver'\n self.jdbc = 'jdbc:postgresql://'\n\nclass Database():\n def __init__(self, args, timeout=-1):\n self.args = args\n self.conn = self.resetConn(timeout)\n\n\n # self.schema = self.compute_table_schema()\n\n def resetConn(self, timeout=-1):\n conn = psycopg2.connect(database=self.args.dbname,\n user=self.args.user,\n password=self.args.password,\n host=self.args.host,\n port=self.args.port)\n return conn\n \n def execute_sqls(self,sql):\n self.conn =self.resetConn(timeout=-1)\n cur = self.conn.cursor()\n cur.execute(sql)\n self.conn.commit()\n cur.close()\n self.conn.close()\n\n def execute_sql_duration(self, duration, sql, max_id=0, commit_interval=500):\n self.conn = self.resetConn(timeout=-1)\n cursor = self.conn.cursor()\n start = time.time()\n cnt = 0\n if duration > 0:\n while (time.time() - start) < duration:\n if max_id > 0:\n id = random.randint(1, max_id - 1)\n cursor.execute(sql + str(id) + ';')\n else:\n cursor.execute(sql)\n cnt += 1\n if cnt % commit_interval == 0:\n self.conn.commit()\n else:\n print(\"error, the duration should be larger than 0\")\n self.conn.commit()\n cursor.close()\n self.conn.close()\n return cnt\n\n def concurrent_execute_sql(self, threads, duration, sql, max_id=0, commit_interval=500):\n pool = ThreadPool(threads)\n results = [pool.apply_async(self.execute_sql_duration, (duration, sql, max_id, commit_interval)) for _ in range(threads)]\n pool.close()\n pool.join()\n return results\n \ndef init():\n #add the config\n config_path = \"/root/DB-GPT/config/tool_config.yaml\"\n with open(config_path, 'r') as config_file:\n config = yaml.safe_load(config_file) \n db_args =DBArgs('pgsql', config)\n return db_args\n\n\n#create a table\ndef create_table(table_name,colsize, ncolumns):\n db=Database(init())\n column_definitions = ', '.join(f'name{i} varchar({colsize})' for i in range(ncolumns))\n creat_sql = f'CREATE TABLE {table_name} (id int, {column_definitions}, time timestamp);'\n db.execute_sqls(creat_sql)\n\n#delete the table\ndef delete_table(table_name):\n db=Database(init())\n delete_sql=f'DROP TABLE if exists {table_name}'\n db.execute_sqls(delete_sql)\n\n#print the current time\ndef print_time():\n current_time = datetime.datetime.now()\n formatted_time = current_time.strftime(\"%Y-%m-%d %H:%M:%S\")\n print(formatted_time)\n\n\ndef 
vacuum(threads,duration,ncolumns,nrows,colsize,table_name='table1'):\n db=Database(init())\n #create a new table\n print_time()\n delete_table(table_name)\n create_table(table_name,colsize, ncolumns)\n\n # insert some data to be deleted\n insert_definitions = ', '.join(f'(SELECT substr(md5(random()::text), 1, {colsize}))' for i in range(ncolumns))\n insert_data=f'insert into {table_name} select generate_series(1,{nrows}),{insert_definitions}, now();' \n db.execute_sqls(insert_data) \n\n # delete 80% of the rows\n delete_nrows=int(nrows*0.8)\n vacuum=f'delete from {table_name} where id < {delete_nrows};'\n db.execute_sqls(vacuum)\n\n # do the select , then the vacuum occurs\n select='select * from '+table_name+' where id='\n db.concurrent_execute_sql(threads,duration,select,nrows)\n\n #delete the table\n delete_table(table_name)\n print_time()\n\n\nif __name__ == \"__main__\":\n # Number of threads to use for concurrent inserts\n num_threads = 61\n \n # Duration for which to run the inserts (in seconds)\n insert_duration = None\n \n # Number of columns in the table\n num_columns = 12\n \n # Number of rows to insert\n num_rows = 2467885\n \n # Size of each column (in characters)\n column_size = 67\n \n # Table name\n table_name = 'table1'\n \n # Call the insert_large_data function\n vacuum(num_threads, insert_duration, num_columns, num_rows, column_size, table_name)\n", + "description": "In the database of a large online platform, 61 users are simultaneously performing searches after a database cleaning operation. The database table contains 12 columns and 2,467,885 rows of product records, with each column having a size of 67 characters. This scenario simulates the exception that can occur when there is a high concurrency of users searching the database after a data cleaning process.\n", + "desc": "In the database of an online store, called 'OnlineStoreDB', there is a table named 'ProductData' that stores information about various products. This table consists of 2,467,885 rows, with each row representing a product entry. The table has 12 columns, including product ID, name, price, stock quantity, description, brand, category, weight, production date, expiration date, supplier information, and country. Each column can hold up to 67 characters of data. During a specific operation, 61 threads are simultaneously performing a database clean-up operation known as 'VACUUM'. This operation involves reorganizing and optimizing the database by removing unnecessary space and reordering data to improve performance. However, due to the large number of threads and the size of the table, this VACUUM operation can put a significant load on the database, potentially leading to performance issues and anomalies in the system. 
It is important to carefully plan and optimize this operation to avoid any negative impacts on the functionality and efficiency of the entire online store.\n" + }, + "420": { + "start_time": "1697358131", + "end_time": "1697358247", + "start_timestamp": "2023-10-15 16:22:11", + "end_timestamp": "2023-10-15 16:24:07", + "alerts": [], + "labels": [ + "too many indexes" + ], + "command": "python anomaly_trigger/main.py --anomaly REDUNDANT_INDEX", + "script": "import psycopg2\nimport sys\nsys.path.append('/root/DB-GPT/')\nimport time\nimport datetime\nimport random\nimport yaml\nfrom multiprocessing.pool import *\n\n\nclass DBArgs(object):\n\n def __init__(self, dbtype, config, dbname=None):\n self.dbtype = dbtype\n if self.dbtype == 'mysql':\n self.host = config['host']\n self.port = config['port']\n self.user = config['user']\n self.password = config['password']\n self.dbname = dbname if dbname else config['dbname']\n self.driver = 'com.mysql.jdbc.Driver'\n self.jdbc = 'jdbc:mysql://'\n else:\n self.host = config['host']\n self.port = config['port']\n self.user = config['user']\n self.password = config['password']\n self.dbname = dbname if dbname else config['dbname']\n self.driver = 'org.postgresql.Driver'\n self.jdbc = 'jdbc:postgresql://'\n\nclass Database():\n def __init__(self, args, timeout=-1):\n self.args = args\n self.conn = self.resetConn(timeout)\n\n\n # self.schema = self.compute_table_schema()\n\n def resetConn(self, timeout=-1):\n conn = psycopg2.connect(database=self.args.dbname,\n user=self.args.user,\n password=self.args.password,\n host=self.args.host,\n port=self.args.port)\n return conn\n \n def execute_sqls(self,sql):\n self.conn =self.resetConn(timeout=-1)\n cur = self.conn.cursor()\n cur.execute(sql)\n self.conn.commit()\n cur.close()\n self.conn.close()\n\n def execute_sql_duration(self, duration, sql, max_id=0, commit_interval=500):\n self.conn = self.resetConn(timeout=-1)\n cursor = self.conn.cursor()\n start = time.time()\n cnt = 0\n if duration > 0:\n while (time.time() - start) < duration:\n if max_id > 0:\n id = random.randint(1, max_id - 1)\n cursor.execute(sql + str(id) + ';')\n else:\n cursor.execute(sql)\n cnt += 1\n if cnt % commit_interval == 0:\n self.conn.commit()\n else:\n print(\"error, the duration should be larger than 0\")\n self.conn.commit()\n cursor.close()\n self.conn.close()\n return cnt\n\n def concurrent_execute_sql(self, threads, duration, sql, max_id=0, commit_interval=500):\n pool = ThreadPool(threads)\n results = [pool.apply_async(self.execute_sql_duration, (duration, sql, max_id, commit_interval)) for _ in range(threads)]\n pool.close()\n pool.join()\n return results\n \n def build_index(self, table_name, idx_num):\n self.conn = self.resetConn(timeout=-1)\n cursor = self.conn.cursor()\n \n for i in range(0, idx_num):\n the_sql = 'CREATE INDEX index_' + table_name + '_' + str(i) + ' ON ' + table_name + '(name' + str(i) + ');'\n print(the_sql)\n cursor.execute(the_sql)\n\n \n self.conn.commit()\n self.conn.close()\n return\n\n\n \n def drop_index(self,table_name):\n self.conn = self.resetConn(timeout=-1)\n cursor = self.conn.cursor()\n cursor.execute(\"select indexname from pg_indexes where tablename='\"+table_name+\"';\")\n idxs = cursor.fetchall()\n for idx in idxs:\n the_sql = 'DROP INDEX ' + idx[0] + ';'\n cursor.execute(the_sql)\n print(the_sql)\n self.conn.commit()\n self.conn.close()\n return\n\n\ndef init():\n #add the config\n config_path = \"/root/DB-GPT/config/tool_config.yaml\"\n with open(config_path, 'r') as config_file:\n 
config = yaml.safe_load(config_file) \n db_args =DBArgs('pgsql', config)\n return db_args\n\n\n#create a table\ndef create_table(table_name,colsize, ncolumns):\n db=Database(init())\n column_definitions = ', '.join(f'name{i} varchar({colsize})' for i in range(ncolumns))\n creat_sql = f'CREATE TABLE {table_name} (id int, {column_definitions}, time timestamp);'\n db.execute_sqls(creat_sql)\n\n#delete the table\ndef delete_table(table_name):\n db=Database(init())\n delete_sql=f'DROP TABLE if exists {table_name}'\n db.execute_sqls(delete_sql)\n\n#print the current time\ndef print_time():\n current_time = datetime.datetime.now()\n formatted_time = current_time.strftime(\"%Y-%m-%d %H:%M:%S\")\n print(formatted_time)\n\ndef redundent_index(threads,duration,ncolumns,nrows,colsize,nindex,table_name='table1'):\n #create a new table\n print_time()\n delete_table(table_name)\n create_table(table_name,colsize, ncolumns)\n db=Database(init())\n # insert some data to be updated \n insert_definitions = ', '.join(f'(SELECT substr(md5(random()::text), 1, {colsize}))' for i in range(ncolumns))\n insert_data=f'insert into {table_name} select generate_series(1,{nrows}),{insert_definitions}, now();' \n db.execute_sqls(insert_data) \n\n #initialization of the indexes\n nindex=int((nindex*ncolumns)/10)\n db.build_index(table_name,nindex)\n id_index='CREATE INDEX index_'+table_name+'_id ON '+table_name+'(id);'\n db.execute_sqls(id_index)\n\n #lock_contention\n pool = Pool(threads)\n for _ in range(threads):\n pool.apply_async(\n lock, (table_name, ncolumns, colsize, duration, nrows))\n pool.close()\n pool.join()\n\n #drop the index\n db.drop_index(table_name)\n\n #delete the table\n delete_table(table_name)\n print_time()\n\n\ndef lock(table_name, ncolumns, colsize, duration, nrows):\n args=init()\n start = time.time()\n #lock_contention\n while time.time()-start < duration:\n conn = psycopg2.connect(database=args.dbname, user=args.user, password=args.password,\n host=args.host, port=args.port)\n cur = conn.cursor()\n while time.time()-start < duration:\n col_name = random.randint(0, ncolumns-1)\n row_name = random.randint(1, nrows-1)\n lock_contention = f'update {table_name} set name{col_name}=(SELECT substr(md5(random()::text), 1, {colsize})) where id ={row_name}'\n #db.concurrent_execute_sql(threads,duration,lock_contention,nrows)\n cur.execute(lock_contention)\n conn.commit()\n conn.commit()\n conn.close()\n\nif __name__ == \"__main__\":\n # Number of threads to use for concurrent inserts\n num_threads = 6\n \n # Duration for which to run the inserts (in seconds)\n insert_duration = None\n \n # Number of columns in the table\n num_columns = 86\n \n # Number of rows to insert\n num_rows = 779666\n \n # Size of each column (in characters)\n column_size = 78\n \n # Table name\n table_name = 'table1'\n \n nindex=6\n \n # Call the insert_large_data function\n redundent_index(num_threads, insert_duration, num_columns, num_rows, column_size, nindex,table_name)\n", + "description": "In a database used for managing inventory in an online store, there is an issue where redundant indexes are created for various attributes of products such as name, category, and price range. This can lead to increased storage usage and slower performance. The simulation involves 6 users searching in the database table which has 86 columns and 779,666 rows. 
Each column is of size 78 characters.\n", + "desc": "In a business intelligence scenario, particularly in the context of financial analysis, there is a database called 'FinancialAnalyticsDB' that stores and processes financial data for analysis. Within this database, there is a key table named 'FinancialData' that contains a large amount of financial data, with 779,666 rows of data. Each row represents a financial record, and there are a total of 86 columns, with each column being able to store up to 78 characters of information. These columns may include transaction ID, transaction type, transaction amount, date, department, project code, financial year, audit status, and other relevant financial information.In the process of performing financial analysis, it is necessary to quickly respond to complex query demands, such as generating financial reports or conducting trend analysis. To improve query performance, the database administrator may create multiple indexes before running the queries. These indexes could be based on transaction type, date range, department, or project code.Suppose at a specific moment, 6 users simultaneously conduct complex financial queries on the 'FinancialData' table. In an effort to enhance query performance, the administrator creates multiple indexes before the queries start and then deletes them after the queries are completed.However, the frequent creation and deletion of indexes can result in additional storage usage and performance overhead in the database. Moreover, this process may lead to database fragmentation, which can further impact performance. Ultimately, this can result in delayed generation of financial reports and affect the efficiency of decision-making processes in the business intelligence environment.\n" + }, + "421": { + "start_time": "1697358307", + "end_time": "1697358398", + "start_timestamp": "2023-10-15 16:25:07", + "end_timestamp": "2023-10-15 16:26:38", + "alerts": [], + "labels": [ + "INSERT_LARGE_DATA", + "IO_CONTENTION" + ], + "command": "python anomaly_trigger/main.py --anomaly INSERT_LARGE_DATA,IO_CONTENTION", + "script": "import os\nimport datetime\n\n#print the current time\ndef print_time():\n current_time = datetime.datetime.now()\n formatted_time = current_time.strftime(\"%Y-%m-%d %H:%M:%S\")\n print(formatted_time)\n\nif __name__ == \"__main__\":\n print_time()\n command = (\n \"su - root -c 'cd /sysbench-tpcc-master; \"\n \"./tpcc.lua --db-driver=pgsql --tables=2 --scale=3 --threads=50 --events=0 \"\n \"--pgsql-host=xxxx --pgsql-user=xxxx --pgsql-password=xxxx \"\n \"--pgsql-port=5432 --pgsql-db=tpcc --time=90 --rand-type=uniform --report-interval=10 run'\"\n )\n\n os.system(command)\n print_time()\n", + "description": "In a system where multiple users are uploading, downloading, or editing files simultaneously, there is contention for I/O resources. This results in sluggish file transfers and a slow file sharing process.\n", + "desc": "In an Internet of Things (IoT) scenario, imagine a database named 'SmartHomeDB', which is responsible for storing and managing the data generated by various sensors in a smart home. This database is designed to handle a large volume of data from sensors such as temperature sensors, motion sensors, door/window sensors, and smart appliances. It contains multiple tables, including a key table named 'SensorData', which records the sensor readings. 
The 'SensorData' table consists of a large number of rows, each representing a reading from a sensor, and contains various columns such as sensor ID, sensor type, reading value, timestamp, and location information. In this particular scenario, due to the IoT devices frequently generating data and sending it to the database for storage and analysis, the database faces challenges related to inserting large amounts of data and I/O (input/output) contention. When a large number of IoT devices simultaneously upload data to the database, the system's storage and network bandwidth might be strained, resulting in slower data insertion speeds. Additionally, frequent write operations in the database, such as data insertion and indexing, can impact database performance. This, in turn, can lead to I/O contention, meaning that the database server's I/O capabilities and resources are insufficient to handle the high influx of data. As a result, the database might experience slower data insertion, delays in processing other operations, and potential performance issues.\n" + }, + "422": { + "start_time": "1697358458", + "end_time": "1697358518", + "start_timestamp": "2023-10-15 16:27:38", + "end_timestamp": "2023-10-15 16:28:38", + "alerts": [], + "labels": [ + "POOR JOIN PERFORMANCE", + "CPU CONTENTION" + ], + "command": "python anomaly_trigger/main.py --anomaly POOR_JOIN_PERFORMANCE,CPU_CONTENTION", + "script": "import os\nimport re\nimport time\n\nimport psycopg2\n\n\nREPEATCOUNT = 1\nTIMELOGPATH = str(int(time.time())) + \"_job_trigger_time_log.txt\"\nTIMELOG = open(TIMELOGPATH, 'w+')\n\n\nclass Database():\n\n def __init__(self):\n self.conn = None\n self.conn = psycopg2.connect(database='imdbload',\n user='xxxx',\n password='xxxx',\n host='xxxx',\n port=5432)\n\n def execute_sql(self, sql):\n fail = 1\n cur = self.conn.cursor()\n i = 0\n cnt = 3\n while fail == 1 and i < cnt:\n try:\n fail = 0\n cur.execute(sql)\n except BaseException as error:\n fail = 1\n print(error)\n res = []\n if fail == 0:\n res = cur.fetchall()\n i = i + 1\n if fail == 1:\n # print(\"SQL Execution Fatal!!\", sql)\n return 0, ''\n elif fail == 0:\n return 1, res\n\n\ndef all_sql_files():\n res_path = \"{}/join-order-benchmark-master/\".format(\n os.path.dirname(os.path.abspath(__file__)))\n # all_file_list = list(filter(file_filter, os.listdir(res_path)))\n # all_file_list = sorted(all_file_list, key=custom_sort)\n all_file_list = [\n '1a.sql', '1b.sql', '1c.sql', '1d.sql',\n '2a.sql', '2b.sql', '2c.sql', '2d.sql',\n '3a.sql', '3b.sql', '3c.sql',\n '4a.sql', '4b.sql', '4c.sql',\n '5a.sql', '5b.sql', '5c.sql',\n '6a.sql', '6b.sql', '6c.sql', '6d.sql', '6e.sql', '6f.sql',\n '7a.sql', '7b.sql', '7c.sql',\n '8a.sql', '8b.sql', '8c.sql', '8d.sql',\n '9a.sql', '9b.sql', '9c.sql', '9d.sql',\n '10a.sql', '10b.sql', '10c.sql',\n '11a.sql', '11b.sql', '11c.sql', '11d.sql',\n '12a.sql', '12b.sql', '12c.sql',\n '13a.sql', '13b.sql', '13c.sql', '13d.sql',\n '14a.sql', '14b.sql', '14c.sql',\n '15a.sql', '15b.sql', '15c.sql', '15d.sql',\n '16a.sql', '16b.sql', '16c.sql', '16d.sql',\n '17a.sql', '17b.sql', '17c.sql', '17d.sql', '17e.sql', '17f.sql',\n '18a.sql', '18b.sql', '18c.sql',\n '19a.sql', '19b.sql', '19c.sql', '19d.sql',\n '20a.sql', '20b.sql', '20c.sql',\n '21a.sql', '21b.sql', '21c.sql',\n '22a.sql', '22b.sql', '22c.sql', '22d.sql',\n '23a.sql', '23b.sql', '23c.sql',\n '24a.sql', '24b.sql',\n '25a.sql', '25b.sql', '25c.sql',\n '26a.sql', '26b.sql', '26c.sql',\n '27a.sql', '27b.sql', '27c.sql',\n '28a.sql', '28b.sql', '28c.sql',\n 
'29a.sql', '29b.sql', '29c.sql',\n '30a.sql', '30b.sql', '30c.sql',\n '31a.sql', '31b.sql', '31c.sql',\n '32a.sql', '32b.sql',\n '33a.sql', '33b.sql', '33c.sql']\n\n print(all_file_list)\n files_list = []\n for file in all_file_list:\n files_list.append(res_path + file)\n return files_list\n\n\ndef custom_sort(item):\n # \u63d0\u53d6\u6570\u5b57\u548c\u5b57\u6bcd\u90e8\u5206\n match = re.match(r'(\\d+)(\\D+)', item)\n # \u5c06\u6570\u5b57\u90e8\u5206\u8f6c\u6362\u4e3a\u6574\u6570\u4ee5\u8fdb\u884c\u6bd4\u8f83\n num_part = int(match.group(1))\n # \u8fd4\u56de\u5143\u7ec4\u4ee5\u6309\u6570\u5b57\u548c\u5b57\u6bcd\u6392\u5e8f\n return (num_part, match.group(2))\n\n\ndef file_filter(f):\n if f[-4:] == '.sql' and 'schema' not in f and 'fkindexes' not in f:\n return True\n else:\n return False\n\n\ndef get_sql_from_file(file_name):\n file = open(file_name)\n lines = file.readlines().copy()\n sql = ''\n for line in lines:\n sql += line\n sql = sql.replace('\n', ' ').replace(' ', ' ').replace(' ', ' ')\n file.close()\n return sql\n\n\ndef test_hint_from_file(sql_file):\n db = Database()\n sql = get_sql_from_file(sql_file)\n success, result_cont = db.execute_sql(sql)\n print(success, result_cont)\n\n\ndef test_all():\n sql_files = all_sql_files()\n\n for sql_file in list(sql_files)[:-10]:\n if sql_file:\n test_hint_from_file(sql_file)\n\n\ndef test_one():\n res_path = \"{}/join-order-benchmark-master/\".format(\n os.path.dirname(os.path.abspath(__file__)))\n test_hint_from_file(res_path + '1a.sql')\n\n\nif __name__ == '__main__':\n for i in range(0, REPEATCOUNT):\n TIMELOG.write(str(int(time.time()))+\";\")\n test_all()\n TIMELOG.write(str(int(time.time()))+\"\n\")\n TIMELOG.flush()\n\n TIMELOG.close()\n\n)\n", + "description": "In a cloud-based data analytics platform, multiple users are performing join operations on large datasets using CPU-intensive algorithms. The high CPU usage and inefficient join algorithms lead to poor performance in terms of query execution time and resource contention.\n", + "desc": "In a business intelligence scenario, there exists a database called 'CorporateAnalyticsDB', which is used for storing and analyzing various business data of a large corporation. This database consists of multiple complex tables, such as 'SalesData', 'CustomerProfiles', 'ProductCatalog', and 'MarketTrends', each containing a large number of rows and columns to record detailed business information. In this particular scenario, analysts frequently perform complex join queries across multiple tables to generate comprehensive business reports. For example, they might join the 'SalesData' table (containing sales records) with the 'CustomerProfiles' table (containing customer information) to analyze the purchasing behaviors of different customer groups. Furthermore, they might also need to link these data with the 'ProductCatalog' (the product catalog) and 'MarketTrends' (market trends) tables to gain deeper market insights. Due to the large size of these tables and the involvement of multi-table joins, the performance of these queries can become significantly slow. The lack of effective indexing on the join keys or inadequate query optimization can result in time and resource-consuming join operations. During peak periods, when multiple complex join queries are executed simultaneously, there could be competition for CPU resources, which further reduces query efficiency. 
CPU contention may occur due to either excessively compute-intensive queries running on the database server or insufficient CPU resources to handle the workload.\n" + }, + "423": { + "start_time": "1697358578", + "end_time": "1697358719", + "start_timestamp": "2023-10-15 16:29:38", + "end_timestamp": "2023-10-15 16:31:59", + "alerts": [], + "labels": [ + "FETCH_LARGE_DATA", + "CORRELATED SUBQUERY" + ], + "command": "python anomaly_trigger/main.py --anomaly FETCH_LARGE_DATA,CORRELATED_SUBQUERY", + "script": "import os\nimport re\nimport time\n\nimport psycopg2\n\n\nREPEATCOUNT = 1\nTIMELOGPATH = str(int(time.time())) + \"_tpch_trigger_time_log.txt\"\nTIMELOG = open(TIMELOGPATH, 'w+')\n\n\nclass Database():\n\n def __init__(self):\n self.conn = None\n self.conn = psycopg2.connect(database='tpch',\n user='xxxx',\n password='xxxx',\n host='xxxx',\n port=5432)\n\n def execute_sql(self, sql):\n fail = 1\n cur = self.conn.cursor()\n i = 0\n cnt = 3\n while fail == 1 and i < cnt:\n try:\n fail = 0\n cur.execute(sql)\n except BaseException as error:\n fail = 1\n print(error)\n res = []\n if fail == 0:\n res = cur.fetchall()\n i = i + 1\n if fail == 1:\n # print(\"SQL Execution Fatal!!\", sql)\n return 0, ''\n elif fail == 0:\n return 1, res\n\n\ndef all_sql_files():\n res_path = \"{}/tpch-queries/\".format(\n os.path.dirname(os.path.abspath(__file__)))\n # all_file_list = list(filter(file_filter, os.listdir(res_path)))\n # all_file_list = sorted(all_file_list, key=custom_sort)\n all_file_list = [\n '4.explain.sql']\n\n print(all_file_list)\n files_list = []\n for file in all_file_list:\n files_list.append(res_path + file)\n return files_list\n\n\ndef custom_sort(item):\n # \u63d0\u53d6\u6570\u5b57\u548c\u5b57\u6bcd\u90e8\u5206\n match = re.match(r'(\\d+)(\\D+)', item)\n # \u5c06\u6570\u5b57\u90e8\u5206\u8f6c\u6362\u4e3a\u6574\u6570\u4ee5\u8fdb\u884c\u6bd4\u8f83\n num_part = int(match.group(1))\n # \u8fd4\u56de\u5143\u7ec4\u4ee5\u6309\u6570\u5b57\u548c\u5b57\u6bcd\u6392\u5e8f\n return (num_part, match.group(2))\n\n\ndef file_filter(f):\n if f[-4:] == '.sql' and 'schema' not in f and 'fkindexes' not in f:\n return True\n else:\n return False\n\n\ndef get_sql_from_file(file_name):\n file = open(file_name)\n lines = file.readlines().copy()\n sql = ''\n for line in lines:\n sql += line\n sql = sql.replace('\n', ' ').replace(' ', ' ').replace(' ', ' ')\n file.close()\n return sql\n\n\ndef test_hint_from_file(sql_file):\n db = Database()\n sql = get_sql_from_file(sql_file)\n success, result_cont = db.execute_sql(sql)\n print(success, result_cont)\n\n\ndef test_all():\n sql_files = all_sql_files()\n\n for sql_file in list(sql_files):\n if sql_file:\n test_hint_from_file(sql_file)\n\n\ndef test_one():\n res_path = \"{}/tpch-queries/\".format(\n os.path.dirname(os.path.abspath(__file__)))\n test_hint_from_file(res_path + '1.explain.sql')\n\n\nif __name__ == '__main__':\n for i in range(0, REPEATCOUNT):\n TIMELOG.write(str(int(time.time()))+\";\")\n test_all()\n TIMELOG.write(str(int(time.time()))+\"\n\")\n TIMELOG.flush()\n\n TIMELOG.close()\n", + "description": "In an online retail system, when retrieving information about the inventory of each product, there is a need to execute related subqueries. 
If the optimization of these subqueries is not done properly, the performance of the query may be affected when dealing with a large number of products.\n", + "desc": "In an e-commerce platform's database called 'ECommerceDB', there is a key table named 'ProductInventory' that stores information about the inventory of various products. This table contains inventory data for tens of thousands or even hundreds of thousands of products. Each product's inventory information includes the product ID, current stock level, last inventory update time, supplier ID, warehouse location, and other details. When querying the inventory level of each product, related subqueries are used. For example, a common query might be to determine the total current inventory of all products within a specific category. This query involves selecting all products of a particular category from the 'ProductDetails' table and conducting subqueries on the 'ProductInventory' table to obtain the inventory data for these products. However, when the number of products is very large, the performance of these related subqueries can become inefficient. In particular, if a category includes thousands of products, executing individual subqueries for each product to obtain inventory information would be time-consuming. This inefficiency is mainly due to the need to retrieve inventory information for a large number of products, which requires reading a significant amount of data from the disk, resulting in potential I/O bottlenecks.\n" + }, + "424": { + "start_time": "1697358780", + "end_time": "1697358851", + "start_timestamp": "2023-10-15 16:33:00", + "end_timestamp": "2023-10-15 16:34:11", + "alerts": [], + "labels": [ + "highly concurrent commits or highly concurrent inserts" + ], + "command": "python anomaly_trigger/main.py --anomaly INSERT_LARGE_DATA", + "script": "import psycopg2\nimport sys\nsys.path.append('/root/DB-GPT/')\nimport time\nimport datetime\nimport random\nimport yaml\nfrom multiprocessing.pool import *\n\nclass DBArgs(object):\n\n def __init__(self, dbtype, config, dbname=None):\n self.dbtype = dbtype\n if self.dbtype == 'mysql':\n self.host = config['host']\n self.port = config['port']\n self.user = config['user']\n self.password = config['password']\n self.dbname = dbname if dbname else config['dbname']\n self.driver = 'com.mysql.jdbc.Driver'\n self.jdbc = 'jdbc:mysql://'\n else:\n self.host = config['host']\n self.port = config['port']\n self.user = config['user']\n self.password = config['password']\n self.dbname = dbname if dbname else config['dbname']\n self.driver = 'org.postgresql.Driver'\n self.jdbc = 'jdbc:postgresql://'\n\nclass Database():\n def __init__(self, args, timeout=-1):\n self.args = args\n self.conn = self.resetConn(timeout)\n\n\n # self.schema = self.compute_table_schema()\n\n def resetConn(self, timeout=-1):\n conn = psycopg2.connect(database=self.args.dbname,\n user=self.args.user,\n password=self.args.password,\n host=self.args.host,\n port=self.args.port)\n return conn\n\n def execute_sqls(self,sql):\n self.conn =self.resetConn(timeout=-1)\n cur = self.conn.cursor()\n cur.execute(sql)\n self.conn.commit()\n cur.close()\n self.conn.close()\n\n def execute_sql_duration(self, duration, sql, max_id=0, commit_interval=500):\n self.conn = self.resetConn(timeout=-1)\n cursor = self.conn.cursor()\n start = time.time()\n cnt = 0\n if duration > 0:\n while (time.time() - start) < duration:\n if max_id > 0:\n id = random.randint(1, max_id - 1)\n cursor.execute(sql + str(id) + ';')\n else:\n 
cursor.execute(sql)\n cnt += 1\n if cnt % commit_interval == 0:\n self.conn.commit()\n else:\n print(\"error, the duration should be larger than 0\")\n self.conn.commit()\n cursor.close()\n self.conn.close()\n return cnt\n\n def concurrent_execute_sql(self, threads, duration, sql, max_id=0, commit_interval=500):\n pool = ThreadPool(threads)\n results = [pool.apply_async(self.execute_sql_duration, (duration, sql, max_id, commit_interval)) for _ in range(threads)]\n pool.close()\n pool.join()\n return results\n \ndef init():\n #add the config\n config_path = \"/root/DB-GPT/config/tool_config.yaml\"\n with open(config_path, 'r') as config_file:\n config = yaml.safe_load(config_file) \n db_args =DBArgs('pgsql', config)\n return db_args\n\n#create a table\ndef create_table(table_name,colsize, ncolumns):\n db=Database(init())\n column_definitions = ', '.join(f'name{i} varchar({colsize})' for i in range(ncolumns))\n creat_sql = f'CREATE TABLE {table_name} (id int, {column_definitions}, time timestamp);'\n db.execute_sqls(creat_sql)\n\n#delete the table\ndef delete_table(table_name):\n db=Database(init())\n delete_sql=f'DROP TABLE if exists {table_name}'\n db.execute_sqls(delete_sql)\n\n#print the current time\ndef print_time():\n current_time = datetime.datetime.now()\n formatted_time = current_time.strftime(\"%Y-%m-%d %H:%M:%S\")\n print(formatted_time)\n\n\ndef insert_large_data(threads,duration,ncolumns,nrows,colsize,table_name='table1'):\n\n print_time()\n #Delete undeleted tables\n delete_table(table_name)\n #create a new table\n create_table(table_name,colsize, ncolumns)\n db=Database(init())\n #insert the data\n #insert_definitions = ', '.join(f'repeat(round(random()*999)::text,{(colsize//3)})' for i in range(ncolumns))\n insert_definitions = ', '.join(f'(SELECT substr(md5(random()::text), 1, {colsize}))' for i in range(ncolumns))\n insert_data=f'insert into {table_name} select generate_series(1,{nrows}),{insert_definitions}, now();'\n db.concurrent_execute_sql(threads,duration,insert_data,commit_interval=1)\n\n #delete the table\n delete_table(table_name)\n \n #print the end time\n print_time()\nif __name__ == \"__main__\":\n # Number of threads to use for concurrent inserts\n num_threads = 106\n \n # Duration for which to run the inserts (in seconds)\n insert_duration = None\n \n # Number of columns in the table\n num_columns = 9\n \n # Number of rows to insert\n num_rows = 85\n \n # Size of each column (in characters)\n column_size = 66\n \n # Table name\n table_name = 'table1'\n \n # Call the insert_large_data function\n insert_large_data(num_threads, insert_duration, num_columns, num_rows, column_size, table_name)\n", + "description": "In an Internet of Things (IoT) application, when there are 106 sensors generating a large amount of data that needs to be inserted into a database simultaneously, with each data entry having 9 columns and a column size of 66 characters, and a total of 85 rows, an exception is simulated due to the high workload and potentially insufficient resources to handle the insertion process.\n", + "desc": "In an Internet of Things (IoT) scenario, there is a database specifically designed for collecting and analyzing sensor data called 'SensorDataDB'. This database is responsible for handling a large volume of data from various types of sensors. The primary table in the database is called 'SensorReadings', which stores data from 85 sensors. 
The table contains 9 columns including sensor ID, reading type, reading value, timestamp, sensor location, and status information. When 106 sensors start transmitting data simultaneously at a high frequency, it can cause performance issues in the database. Due to potential lack of effective data partitioning, insufficient buffering mechanisms, or improper indexing, the database's ability to process simultaneous write requests is limited. This can lead to increased write latency and even database locking, resulting in anomalies.\n" + }, + "425": { + "start_time": "1697358911", + "end_time": "1697358983", + "start_timestamp": "2023-10-15 16:35:11", + "end_timestamp": "2023-10-15 16:36:23", + "alerts": [], + "labels": [ + "highly concurrent commits or highly concurrent inserts" + ], + "command": "python anomaly_trigger/main.py --anomaly INSERT_LARGE_DATA", + "script": "import psycopg2\nimport sys\nsys.path.append('/root/DB-GPT/')\nimport time\nimport datetime\nimport random\nimport yaml\nfrom multiprocessing.pool import *\n\nclass DBArgs(object):\n\n def __init__(self, dbtype, config, dbname=None):\n self.dbtype = dbtype\n if self.dbtype == 'mysql':\n self.host = config['host']\n self.port = config['port']\n self.user = config['user']\n self.password = config['password']\n self.dbname = dbname if dbname else config['dbname']\n self.driver = 'com.mysql.jdbc.Driver'\n self.jdbc = 'jdbc:mysql://'\n else:\n self.host = config['host']\n self.port = config['port']\n self.user = config['user']\n self.password = config['password']\n self.dbname = dbname if dbname else config['dbname']\n self.driver = 'org.postgresql.Driver'\n self.jdbc = 'jdbc:postgresql://'\n\nclass Database():\n def __init__(self, args, timeout=-1):\n self.args = args\n self.conn = self.resetConn(timeout)\n\n\n # self.schema = self.compute_table_schema()\n\n def resetConn(self, timeout=-1):\n conn = psycopg2.connect(database=self.args.dbname,\n user=self.args.user,\n password=self.args.password,\n host=self.args.host,\n port=self.args.port)\n return conn\n\n def execute_sqls(self,sql):\n self.conn =self.resetConn(timeout=-1)\n cur = self.conn.cursor()\n cur.execute(sql)\n self.conn.commit()\n cur.close()\n self.conn.close()\n\n def execute_sql_duration(self, duration, sql, max_id=0, commit_interval=500):\n self.conn = self.resetConn(timeout=-1)\n cursor = self.conn.cursor()\n start = time.time()\n cnt = 0\n if duration > 0:\n while (time.time() - start) < duration:\n if max_id > 0:\n id = random.randint(1, max_id - 1)\n cursor.execute(sql + str(id) + ';')\n else:\n cursor.execute(sql)\n cnt += 1\n if cnt % commit_interval == 0:\n self.conn.commit()\n else:\n print(\"error, the duration should be larger than 0\")\n self.conn.commit()\n cursor.close()\n self.conn.close()\n return cnt\n\n def concurrent_execute_sql(self, threads, duration, sql, max_id=0, commit_interval=500):\n pool = ThreadPool(threads)\n results = [pool.apply_async(self.execute_sql_duration, (duration, sql, max_id, commit_interval)) for _ in range(threads)]\n pool.close()\n pool.join()\n return results\n \ndef init():\n #add the config\n config_path = \"/root/DB-GPT/config/tool_config.yaml\"\n with open(config_path, 'r') as config_file:\n config = yaml.safe_load(config_file) \n db_args =DBArgs('pgsql', config)\n return db_args\n\n#create a table\ndef create_table(table_name,colsize, ncolumns):\n db=Database(init())\n column_definitions = ', '.join(f'name{i} varchar({colsize})' for i in range(ncolumns))\n creat_sql = f'CREATE TABLE {table_name} (id int, 
{column_definitions}, time timestamp);'\n db.execute_sqls(creat_sql)\n\n#delete the table\ndef delete_table(table_name):\n db=Database(init())\n delete_sql=f'DROP TABLE if exists {table_name}'\n db.execute_sqls(delete_sql)\n\n#print the current time\ndef print_time():\n current_time = datetime.datetime.now()\n formatted_time = current_time.strftime(\"%Y-%m-%d %H:%M:%S\")\n print(formatted_time)\n\n\ndef insert_large_data(threads,duration,ncolumns,nrows,colsize,table_name='table1'):\n\n print_time()\n #Delete undeleted tables\n delete_table(table_name)\n #create a new table\n create_table(table_name,colsize, ncolumns)\n db=Database(init())\n #insert the data\n #insert_definitions = ', '.join(f'repeat(round(random()*999)::text,{(colsize//3)})' for i in range(ncolumns))\n insert_definitions = ', '.join(f'(SELECT substr(md5(random()::text), 1, {colsize}))' for i in range(ncolumns))\n insert_data=f'insert into {table_name} select generate_series(1,{nrows}),{insert_definitions}, now();'\n db.concurrent_execute_sql(threads,duration,insert_data,commit_interval=1)\n\n #delete the table\n delete_table(table_name)\n \n #print the end time\n print_time()\nif __name__ == \"__main__\":\n # Number of threads to use for concurrent inserts\n num_threads = 106\n \n # Duration for which to run the inserts (in seconds)\n insert_duration = None\n \n # Number of columns in the table\n num_columns = 26\n \n # Number of rows to insert\n num_rows = 92\n \n # Size of each column (in characters)\n column_size = 67\n \n # Table name\n table_name = 'table1'\n \n # Call the insert_large_data function\n insert_large_data(num_threads, insert_duration, num_columns, num_rows, column_size, table_name)\n", + "description": "In an IoT system, there are 106 devices generating a large amount of data that needs to be inserted into a database simultaneously. The database table has 26 columns with each column having a size of 67 characters, and there are 92 rows. This simulation represents the database exception that can occur when inserting such a large amount of data.\n", + "desc": "In an Internet of Things (IoT) scenario, there is a database called 'IoTDataDB' specifically designed for storing and processing sensor data. This database handles a large volume of data from various types of sensors, such as temperature, humidity, pressure, light, and motion sensors. Within this database, there is a key table named 'SensorReadings', which contains detailed information about the sensor readings. This table consists of 92 rows of data, each row representing a sensor reading, with a total of 26 columns. These columns store information such as sensor ID, reading type, reading value, timestamp, sensor location, and status information.In this specific case, the database is experiencing issues related to inserting a large amount of data. The user wants to insert a significant number of new sensor readings into the 'SensorReadings' table, which involves inserting 106 rows of data. Each row contains 26 columns, with each column capable of storing up to 67 characters of information.However, due to the lack of efficient data insertion mechanisms, improper indexing, or insufficient database resources, the insertion of such a large amount of data can cause performance issues. This could result in slower insertion speeds, increased resource consumption, or even database locking. 
These anomalies might lead to delays in processing other data insertion requests, potential data loss, or interrupted sensor data collection.To address these issues, appropriate measures such as optimizing data insertion processes, implementing efficient buffering mechanisms, or increasing the database resources may be necessary. These actions can help ensure smooth and efficient insertion of large volumes of sensor data into the database.\n" + }, + "426": { + "start_time": "1697359043", + "end_time": "1697359103", + "start_timestamp": "2023-10-15 16:37:23", + "end_timestamp": "2023-10-15 16:38:23", + "alerts": [], + "labels": [ + "highly concurrent updates" + ], + "command": "python anomaly_trigger/main.py --anomaly LOCK_CONTENTION", + "script": "import psycopg2\nimport sys\nsys.path.append('/root/DB-GPT/')\nimport time\nimport datetime\nimport random\nimport yaml\nfrom multiprocessing.pool import *\n\n\nclass DBArgs(object):\n\n def __init__(self, dbtype, config, dbname=None):\n self.dbtype = dbtype\n if self.dbtype == 'mysql':\n self.host = config['host']\n self.port = config['port']\n self.user = config['user']\n self.password = config['password']\n self.dbname = dbname if dbname else config['dbname']\n self.driver = 'com.mysql.jdbc.Driver'\n self.jdbc = 'jdbc:mysql://'\n else:\n self.host = config['host']\n self.port = config['port']\n self.user = config['user']\n self.password = config['password']\n self.dbname = dbname if dbname else config['dbname']\n self.driver = 'org.postgresql.Driver'\n self.jdbc = 'jdbc:postgresql://'\n\nclass Database():\n def __init__(self, args, timeout=-1):\n self.args = args\n self.conn = self.resetConn(timeout)\n\n\n # self.schema = self.compute_table_schema()\n\n def resetConn(self, timeout=-1):\n conn = psycopg2.connect(database=self.args.dbname,\n user=self.args.user,\n password=self.args.password,\n host=self.args.host,\n port=self.args.port)\n return conn\n \n def execute_sqls(self,sql):\n self.conn =self.resetConn(timeout=-1)\n cur = self.conn.cursor()\n cur.execute(sql)\n self.conn.commit()\n cur.close()\n self.conn.close()\n\n def execute_sql_duration(self, duration, sql, max_id=0, commit_interval=500):\n self.conn = self.resetConn(timeout=-1)\n cursor = self.conn.cursor()\n start = time.time()\n cnt = 0\n if duration > 0:\n while (time.time() - start) < duration:\n if max_id > 0:\n id = random.randint(1, max_id - 1)\n cursor.execute(sql + str(id) + ';')\n else:\n cursor.execute(sql)\n cnt += 1\n if cnt % commit_interval == 0:\n self.conn.commit()\n else:\n print(\"error, the duration should be larger than 0\")\n self.conn.commit()\n cursor.close()\n self.conn.close()\n return cnt\n\n def concurrent_execute_sql(self, threads, duration, sql, max_id=0, commit_interval=500):\n pool = ThreadPool(threads)\n results = [pool.apply_async(self.execute_sql_duration, (duration, sql, max_id, commit_interval)) for _ in range(threads)]\n pool.close()\n pool.join()\n return results\n\ndef init():\n #add the config\n config_path = \"/root/DB-GPT/config/tool_config.yaml\"\n with open(config_path, 'r') as config_file:\n config = yaml.safe_load(config_file) \n db_args =DBArgs('pgsql', config)\n return db_args\n\n\n#create a table\ndef create_table(table_name,colsize, ncolumns):\n db=Database(init())\n column_definitions = ', '.join(f'name{i} varchar({colsize})' for i in range(ncolumns))\n creat_sql = f'CREATE TABLE {table_name} (id int, {column_definitions}, time timestamp);'\n db.execute_sqls(creat_sql)\n\n#delete the table\ndef delete_table(table_name):\n 
db=Database(init())\n delete_sql=f'DROP TABLE if exists {table_name}'\n db.execute_sqls(delete_sql)\n\n#print the current time\ndef print_time():\n current_time = datetime.datetime.now()\n formatted_time = current_time.strftime(\"%Y-%m-%d %H:%M:%S\")\n print(formatted_time)\n\n\ndef lock_contention(threads,duration,ncolumns,nrows,colsize,table_name='table1'):\n #create a new table\n print_time()\n delete_table(table_name)\n create_table(table_name,colsize, ncolumns)\n db=Database(init())\n # insert some data to be updated \n insert_definitions = ', '.join(f'(SELECT substr(md5(random()::text), 1, {colsize}))' for i in range(ncolumns))\n insert_data=f'insert into {table_name} select generate_series(1,{nrows}),{insert_definitions}, now();' \n db.execute_sqls(insert_data) \n pool = Pool(threads)\n for _ in range(threads):\n pool.apply_async(\n lock, (table_name, ncolumns, colsize, duration, nrows))\n pool.close()\n pool.join()\n #delete the table\n delete_table(table_name)\n print_time()\n\ndef lock(table_name, ncolumns, colsize, duration, nrows):\n args=init()\n start = time.time()\n #lock_contention\n while time.time()-start < duration:\n conn = psycopg2.connect(database=args.dbname, user=args.user, password=args.password,\n host=args.host, port=args.port)\n cur = conn.cursor()\n while time.time()-start < duration:\n col_name = random.randint(0, ncolumns-1)\n row_name = random.randint(1, nrows-1)\n lock_contention = f'update {table_name} set name{col_name}=(SELECT substr(md5(random()::text), 1, {colsize})) where id ={row_name}'\n #db.concurrent_execute_sql(threads,duration,lock_contention,nrows)\n cur.execute(lock_contention)\n conn.commit()\n conn.commit()\n conn.close()\n\nif __name__ == \"__main__\":\n # Number of threads to use for concurrent inserts\n num_threads = 197\n \n # Duration for which to run the inserts (in seconds)\n insert_duration = None\n \n # Number of columns in the table\n num_columns = 52\n \n # Number of rows to insert\n num_rows = 267\n \n # Size of each column (in characters)\n column_size = 67\n \n # Table name\n table_name = 'table1'\n \n # Call the insert_large_data function\n lock_contention(num_threads, insert_duration, num_columns, num_rows, column_size, table_name)\n", + "description": "In the database of an online store, 197 users simultaneously attempt to perform frequent update operations in a database table containing 52 columns and 267 rows of product records, each with a column size of 67 characters. These users compete with each other to lock the database table to perform the update operation, potentially causing contention and leading to a database exception.\n", + "desc": "In the banking industry, there is a database used for managing customer and transaction data called 'BankDB'. This database contains a table called 'TransactionRecords' which records various information about transactions made by customers. There are 267 rows in this table, each representing a transaction, with a total of 52 columns, with each column containing information of up to 67 characters. These columns might include transaction ID, account number, transaction type (such as deposit, withdrawal, or transfer), transaction amount, transaction date and time, counterparty details, transaction status, employee ID (of the bank staff who processed the transaction), location, and more. At a specific moment, 197 users of the bank, such as customers and bank staff, might simultaneously try to access and perform operations on the 'TransactionRecords' table. 
Due to the high concurrency and the database's locking mechanism, there might be contention for resources, specifically, locking the table during simultaneous operations. If this locking lasts for a considerable amount of time, it can result in performance issues in the database. During peak business hours, such contention can lead to delayed processing or failure of other users' transaction requests, impacting the daily operations of the bank. It may also cause an increase in the size of the transaction log, occupying excessive storage space, and potentially lead to temporary interruptions in database services.\n" + }, + "427": { + "start_time": "1697359163", + "end_time": "1697359225", + "start_timestamp": "2023-10-15 16:39:23", + "end_timestamp": "2023-10-15 16:40:25", + "alerts": [], + "labels": [ + "highly deletes" + ], + "command": "python anomaly_trigger/main.py --anomaly VACUUM", + "script": "import psycopg2\nimport sys\nsys.path.append('/root/DB-GPT/')\nimport time\nimport datetime\nimport random\nimport yaml\nfrom multiprocessing.pool import *\n\n\nclass DBArgs(object):\n\n def __init__(self, dbtype, config, dbname=None):\n self.dbtype = dbtype\n if self.dbtype == 'mysql':\n self.host = config['host']\n self.port = config['port']\n self.user = config['user']\n self.password = config['password']\n self.dbname = dbname if dbname else config['dbname']\n self.driver = 'com.mysql.jdbc.Driver'\n self.jdbc = 'jdbc:mysql://'\n else:\n self.host = config['host']\n self.port = config['port']\n self.user = config['user']\n self.password = config['password']\n self.dbname = dbname if dbname else config['dbname']\n self.driver = 'org.postgresql.Driver'\n self.jdbc = 'jdbc:postgresql://'\n\nclass Database():\n def __init__(self, args, timeout=-1):\n self.args = args\n self.conn = self.resetConn(timeout)\n\n\n # self.schema = self.compute_table_schema()\n\n def resetConn(self, timeout=-1):\n conn = psycopg2.connect(database=self.args.dbname,\n user=self.args.user,\n password=self.args.password,\n host=self.args.host,\n port=self.args.port)\n return conn\n \n def execute_sqls(self,sql):\n self.conn =self.resetConn(timeout=-1)\n cur = self.conn.cursor()\n cur.execute(sql)\n self.conn.commit()\n cur.close()\n self.conn.close()\n\n def execute_sql_duration(self, duration, sql, max_id=0, commit_interval=500):\n self.conn = self.resetConn(timeout=-1)\n cursor = self.conn.cursor()\n start = time.time()\n cnt = 0\n if duration > 0:\n while (time.time() - start) < duration:\n if max_id > 0:\n id = random.randint(1, max_id - 1)\n cursor.execute(sql + str(id) + ';')\n else:\n cursor.execute(sql)\n cnt += 1\n if cnt % commit_interval == 0:\n self.conn.commit()\n else:\n print(\"error, the duration should be larger than 0\")\n self.conn.commit()\n cursor.close()\n self.conn.close()\n return cnt\n\n def concurrent_execute_sql(self, threads, duration, sql, max_id=0, commit_interval=500):\n pool = ThreadPool(threads)\n results = [pool.apply_async(self.execute_sql_duration, (duration, sql, max_id, commit_interval)) for _ in range(threads)]\n pool.close()\n pool.join()\n return results\n \ndef init():\n #add the config\n config_path = \"/root/DB-GPT/config/tool_config.yaml\"\n with open(config_path, 'r') as config_file:\n config = yaml.safe_load(config_file) \n db_args =DBArgs('pgsql', config)\n return db_args\n\n\n#create a table\ndef create_table(table_name,colsize, ncolumns):\n db=Database(init())\n column_definitions = ', '.join(f'name{i} varchar({colsize})' for i in range(ncolumns))\n creat_sql = f'CREATE 
TABLE {table_name} (id int, {column_definitions}, time timestamp);'\n db.execute_sqls(creat_sql)\n\n#delete the table\ndef delete_table(table_name):\n db=Database(init())\n delete_sql=f'DROP TABLE if exists {table_name}'\n db.execute_sqls(delete_sql)\n\n#print the current time\ndef print_time():\n current_time = datetime.datetime.now()\n formatted_time = current_time.strftime(\"%Y-%m-%d %H:%M:%S\")\n print(formatted_time)\n\n\ndef vacuum(threads,duration,ncolumns,nrows,colsize,table_name='table1'):\n db=Database(init())\n #create a new table\n print_time()\n delete_table(table_name)\n create_table(table_name,colsize, ncolumns)\n\n # insert some data to be deleted\n insert_definitions = ', '.join(f'(SELECT substr(md5(random()::text), 1, {colsize}))' for i in range(ncolumns))\n insert_data=f'insert into {table_name} select generate_series(1,{nrows}),{insert_definitions}, now();' \n db.execute_sqls(insert_data) \n\n # delete 80% of the rows\n delete_nrows=int(nrows*0.8)\n vacuum=f'delete from {table_name} where id < {delete_nrows};'\n db.execute_sqls(vacuum)\n\n # do the select , then the vacuum occurs\n select='select * from '+table_name+' where id='\n db.concurrent_execute_sql(threads,duration,select,nrows)\n\n #delete the table\n delete_table(table_name)\n print_time()\n\n\nif __name__ == \"__main__\":\n # Number of threads to use for concurrent inserts\n num_threads = 163\n \n # Duration for which to run the inserts (in seconds)\n insert_duration = None\n \n # Number of columns in the table\n num_columns = 9\n \n # Number of rows to insert\n num_rows = 3296063\n \n # Size of each column (in characters)\n column_size = 67\n \n # Table name\n table_name = 'table1'\n \n # Call the insert_large_data function\n vacuum(num_threads, insert_duration, num_columns, num_rows, column_size, table_name)\n", + "description": "In the database of an online platform, a large number of users are searching for products using various criteria such as product name, category, and price range. This script simulates a scenario where 163 users perform simultaneous searches on a database table containing 9 columns and 3,296,063 rows, with each column having a size of 67 characters. The purpose is to trigger a database exception by compressing and reclaiming wasted space in the table through the VACUUM operation and observing the impact on the search performance.\n", + "desc": "In an e-commerce scenario, there is a database designed for an online store called 'OnlineStoreDB'. This database is used to store information about various products available on the online store. One of the main tables in this database is 'ProductRecords', which contains detailed information about the products. This table consists of 3,296,063 rows, with each row representing a unique product entry. The table has a total of 9 columns, each column capable of storing up to 67 characters. The columns in this table include product ID, name, price, stock quantity, brand, category, description, image link, and product status.The 'VACUUM' operation in a database is used to reclaim storage space, optimize database performance, and improve query response time. It works by removing unnecessary or outdated data from the database, reorganizing the data to improve storage efficiency, and updating statistics and index information. In the given scenario, the 'VACUUM' operation is being performed on the 'ProductRecords' table of the 'OnlineStoreDB' database. 
This could be due to various reasons, such as data cleanup, removal of obsolete or expired products, or consolidation of data in the table. However, the execution of the 'VACUUM' operation with these parameters may pose challenges. With 163 concurrent threads executing the 'VACUUM' operation, the database server might face increased CPU and I/O contention. This could impact the performance of other ongoing database operations and potentially cause delays or failures in query processing or data insertion. To ensure smoother execution of the 'VACUUM' operation, it is recommended to consider factors such as the system's CPU and I/O capabilities, scheduling the operation during off-peak hours, and optimizing the vacuuming process to minimize resource contention.\n" + }, + "428": { + "start_time": "1697359286", + "end_time": "1697359400", + "start_timestamp": "2023-10-15 16:41:26", + "end_timestamp": "2023-10-15 16:43:20", + "alerts": [], + "labels": [ + "too many indexes" + ], + "command": "python anomaly_trigger/main.py --anomaly REDUNDANT_INDEX", + "script": "import psycopg2\nimport sys\nsys.path.append('/root/DB-GPT/')\nimport time\nimport datetime\nimport random\nimport yaml\nfrom multiprocessing.pool import *\n\n\nclass DBArgs(object):\n\n def __init__(self, dbtype, config, dbname=None):\n self.dbtype = dbtype\n if self.dbtype == 'mysql':\n self.host = config['host']\n self.port = config['port']\n self.user = config['user']\n self.password = config['password']\n self.dbname = dbname if dbname else config['dbname']\n self.driver = 'com.mysql.jdbc.Driver'\n self.jdbc = 'jdbc:mysql://'\n else:\n self.host = config['host']\n self.port = config['port']\n self.user = config['user']\n self.password = config['password']\n self.dbname = dbname if dbname else config['dbname']\n self.driver = 'org.postgresql.Driver'\n self.jdbc = 'jdbc:postgresql://'\n\nclass Database():\n def __init__(self, args, timeout=-1):\n self.args = args\n self.conn = self.resetConn(timeout)\n\n\n # self.schema = self.compute_table_schema()\n\n def resetConn(self, timeout=-1):\n conn = psycopg2.connect(database=self.args.dbname,\n user=self.args.user,\n password=self.args.password,\n host=self.args.host,\n port=self.args.port)\n return conn\n \n def execute_sqls(self,sql):\n self.conn =self.resetConn(timeout=-1)\n cur = self.conn.cursor()\n cur.execute(sql)\n self.conn.commit()\n cur.close()\n self.conn.close()\n\n def execute_sql_duration(self, duration, sql, max_id=0, commit_interval=500):\n self.conn = self.resetConn(timeout=-1)\n cursor = self.conn.cursor()\n start = time.time()\n cnt = 0\n if duration > 0:\n while (time.time() - start) < duration:\n if max_id > 0:\n id = random.randint(1, max_id - 1)\n cursor.execute(sql + str(id) + ';')\n else:\n cursor.execute(sql)\n cnt += 1\n if cnt % commit_interval == 0:\n self.conn.commit()\n else:\n print(\"error, the duration should be larger than 0\")\n self.conn.commit()\n cursor.close()\n self.conn.close()\n return cnt\n\n def concurrent_execute_sql(self, threads, duration, sql, max_id=0, commit_interval=500):\n pool = ThreadPool(threads)\n results = [pool.apply_async(self.execute_sql_duration, (duration, sql, max_id, commit_interval)) for _ in range(threads)]\n pool.close()\n pool.join()\n return results\n \n def build_index(self, table_name, idx_num):\n self.conn = self.resetConn(timeout=-1)\n cursor = self.conn.cursor()\n \n for i in range(0, idx_num):\n the_sql = 'CREATE INDEX index_' + table_name + '_' + str(i) + ' ON ' + table_name + '(name' + str(i) + ');'\n 
print(the_sql)\n cursor.execute(the_sql)\n\n \n self.conn.commit()\n self.conn.close()\n return\n\n\n \n def drop_index(self,table_name):\n self.conn = self.resetConn(timeout=-1)\n cursor = self.conn.cursor()\n cursor.execute(\"select indexname from pg_indexes where tablename='\"+table_name+\"';\")\n idxs = cursor.fetchall()\n for idx in idxs:\n the_sql = 'DROP INDEX ' + idx[0] + ';'\n cursor.execute(the_sql)\n print(the_sql)\n self.conn.commit()\n self.conn.close()\n return\n\n\ndef init():\n #add the config\n config_path = \"/root/DB-GPT/config/tool_config.yaml\"\n with open(config_path, 'r') as config_file:\n config = yaml.safe_load(config_file) \n db_args =DBArgs('pgsql', config)\n return db_args\n\n\n#create a table\ndef create_table(table_name,colsize, ncolumns):\n db=Database(init())\n column_definitions = ', '.join(f'name{i} varchar({colsize})' for i in range(ncolumns))\n creat_sql = f'CREATE TABLE {table_name} (id int, {column_definitions}, time timestamp);'\n db.execute_sqls(creat_sql)\n\n#delete the table\ndef delete_table(table_name):\n db=Database(init())\n delete_sql=f'DROP TABLE if exists {table_name}'\n db.execute_sqls(delete_sql)\n\n#print the current time\ndef print_time():\n current_time = datetime.datetime.now()\n formatted_time = current_time.strftime(\"%Y-%m-%d %H:%M:%S\")\n print(formatted_time)\n\ndef redundent_index(threads,duration,ncolumns,nrows,colsize,nindex,table_name='table1'):\n #create a new table\n print_time()\n delete_table(table_name)\n create_table(table_name,colsize, ncolumns)\n db=Database(init())\n # insert some data to be updated \n insert_definitions = ', '.join(f'(SELECT substr(md5(random()::text), 1, {colsize}))' for i in range(ncolumns))\n insert_data=f'insert into {table_name} select generate_series(1,{nrows}),{insert_definitions}, now();' \n db.execute_sqls(insert_data) \n\n #initialization of the indexes\n nindex=int((nindex*ncolumns)/10)\n db.build_index(table_name,nindex)\n id_index='CREATE INDEX index_'+table_name+'_id ON '+table_name+'(id);'\n db.execute_sqls(id_index)\n\n #lock_contention\n pool = Pool(threads)\n for _ in range(threads):\n pool.apply_async(\n lock, (table_name, ncolumns, colsize, duration, nrows))\n pool.close()\n pool.join()\n\n #drop the index\n db.drop_index(table_name)\n\n #delete the table\n delete_table(table_name)\n print_time()\n\n\ndef lock(table_name, ncolumns, colsize, duration, nrows):\n args=init()\n start = time.time()\n #lock_contention\n while time.time()-start < duration:\n conn = psycopg2.connect(database=args.dbname, user=args.user, password=args.password,\n host=args.host, port=args.port)\n cur = conn.cursor()\n while time.time()-start < duration:\n col_name = random.randint(0, ncolumns-1)\n row_name = random.randint(1, nrows-1)\n lock_contention = f'update {table_name} set name{col_name}=(SELECT substr(md5(random()::text), 1, {colsize})) where id ={row_name}'\n #db.concurrent_execute_sql(threads,duration,lock_contention,nrows)\n cur.execute(lock_contention)\n conn.commit()\n conn.commit()\n conn.close()\n\nif __name__ == \"__main__\":\n # Number of threads to use for concurrent inserts\n num_threads = 10\n \n # Duration for which to run the inserts (in seconds)\n insert_duration = None\n \n # Number of columns in the table\n num_columns = 63\n \n # Number of rows to insert\n num_rows = 421698\n \n # Size of each column (in characters)\n column_size = 88\n \n # Table name\n table_name = 'table1'\n \n nindex=6\n \n # Call the insert_large_data function\n redundent_index(num_threads, insert_duration, 
num_columns, num_rows, column_size, nindex,table_name)\n", + "description": "In a database used for an online marketplace, there are 10 users performing a query operation on a table containing 63 columns and 421,698 rows of data. Each column has a size of 88 characters. These users are creating redundant indexes for various attributes like product name, category, and price range. This process can result in additional storage usage and performance overhead.\n", + "desc": "In a business intelligence scenario, there is a database named 'AnalyticalDB' used for storing and analyzing data from various sources. This database contains multiple tables, including a key table named 'DataRecords', which stores detailed information about different data records. The 'DataRecords' table consists of 421,698 rows, each representing a data record, with a total of 63 columns. These columns include attributes such as data ID, data type, source, date, location, quality, and other relevant information. In the process of data analysis and exploration, database administrators might attempt to create redundant indexes to accelerate query performance. These indexes might be created based on various criteria, such as data type, source, date range, or specific columns. However, excessive creation of redundant indexes without proper optimization can lead to additional storage usage and overhead in the database. Furthermore, frequent index operations might cause database fragmentation and impact overall performance. In a business intelligence environment, this could result in delayed and inefficient query execution, hindering data analysis and decision-making processes.\n" + }, + "429": { + "start_time": "1697359460", + "end_time": "1697359551", + "start_timestamp": "2023-10-15 16:44:20", + "end_timestamp": "2023-10-15 16:45:51", + "alerts": [], + "labels": [ + "INSERT_LARGE_DATA", + "IO_CONTENTION" + ], + "command": "python anomaly_trigger/main.py --anomaly INSERT_LARGE_DATA,IO_CONTENTION", + "script": "import os\nimport datetime\n\n#print the current time\ndef print_time():\n current_time = datetime.datetime.now()\n formatted_time = current_time.strftime(\"%Y-%m-%d %H:%M:%S\")\n print(formatted_time)\n\nif __name__ == \"__main__\":\n print_time()\n command = (\n \"su - root -c 'cd /sysbench-tpcc-master; \"\n \"./tpcc.lua --db-driver=pgsql --tables=2 --scale=3 --threads=50 --events=0 \"\n \"--pgsql-host=xxxx --pgsql-user=xxxx --pgsql-password=xxxx \"\n \"--pgsql-port=5432 --pgsql-db=tpcc --time=90 --rand-type=uniform --report-interval=10 run'\"\n )\n\n os.system(command)\n print_time()\n", + "description": "In a system where multiple users are sharing files, there is a high competition for input/output (I/O) resources. This leads to a slowdown in file transfer when multiple users are simultaneously uploading, downloading, or editing files.\n", + "desc": "In the file sharing system of an organization, there is a database called 'TeamFileShareDB' which is used for storing and managing shared files. This database contains not only the files themselves but also metadata such as file size, uploaders' information, access permissions, creation and modification dates, and version history. Throughout the day, multiple users might be simultaneously uploading, downloading, or editing files, resulting in a high level of concurrency in file operations. For instance, a project team could be collaborating on a report, constantly uploading the latest versions while others download them for review or editing. 
Furthermore, the system may also handle large file uploads, such as presentations, video conference recordings, or design drawings. This concurrent activity puts a strain on the database's input/output (I/O) capabilities. When multiple users simultaneously upload or download large files, the system's storage and network bandwidth may become overwhelmed. This I/O contention can lead to slower file transfer speeds, particularly in situations with limited bandwidth or insufficient server processing power. Moreover, the frequent write operations in the database, including file uploads and metadata updates, can affect the overall performance of the database. During peak periods, the database may experience locking and transaction management issues, further slowing down file processing and metadata recording.\n" + }, + "430": { + "start_time": "1697359611", + "end_time": "1697359672", + "start_timestamp": "2023-10-15 16:46:51", + "end_timestamp": "2023-10-15 16:47:52", + "alerts": [], + "labels": [ + "POOR JOIN PERFORMANCE", + "CPU CONTENTION" + ], + "command": "python anomaly_trigger/main.py --anomaly POOR_JOIN_PERFORMANCE,CPU_CONTENTION", + "script": "import os\nimport re\nimport time\n\nimport psycopg2\n\n\nREPEATCOUNT = 1\nTIMELOGPATH = str(int(time.time())) + \"_job_trigger_time_log.txt\"\nTIMELOG = open(TIMELOGPATH, 'w+')\n\n\nclass Database():\n\n def __init__(self):\n self.conn = None\n self.conn = psycopg2.connect(database='imdbload',\n user='xxxx',\n password='xxxx',\n host='xxxx',\n port=5432)\n\n def execute_sql(self, sql):\n fail = 1\n cur = self.conn.cursor()\n i = 0\n cnt = 3\n while fail == 1 and i < cnt:\n try:\n fail = 0\n cur.execute(sql)\n except BaseException as error:\n fail = 1\n print(error)\n res = []\n if fail == 0:\n res = cur.fetchall()\n i = i + 1\n if fail == 1:\n # print(\"SQL Execution Fatal!!\", sql)\n return 0, ''\n elif fail == 0:\n return 1, res\n\n\ndef all_sql_files():\n res_path = \"{}/join-order-benchmark-master/\".format(\n os.path.dirname(os.path.abspath(__file__)))\n # all_file_list = list(filter(file_filter, os.listdir(res_path)))\n # all_file_list = sorted(all_file_list, key=custom_sort)\n all_file_list = [\n '1a.sql', '1b.sql', '1c.sql', '1d.sql',\n '2a.sql', '2b.sql', '2c.sql', '2d.sql',\n '3a.sql', '3b.sql', '3c.sql',\n '4a.sql', '4b.sql', '4c.sql',\n '5a.sql', '5b.sql', '5c.sql',\n '6a.sql', '6b.sql', '6c.sql', '6d.sql', '6e.sql', '6f.sql',\n '7a.sql', '7b.sql', '7c.sql',\n '8a.sql', '8b.sql', '8c.sql', '8d.sql',\n '9a.sql', '9b.sql', '9c.sql', '9d.sql',\n '10a.sql', '10b.sql', '10c.sql',\n '11a.sql', '11b.sql', '11c.sql', '11d.sql',\n '12a.sql', '12b.sql', '12c.sql',\n '13a.sql', '13b.sql', '13c.sql', '13d.sql',\n '14a.sql', '14b.sql', '14c.sql',\n '15a.sql', '15b.sql', '15c.sql', '15d.sql',\n '16a.sql', '16b.sql', '16c.sql', '16d.sql',\n '17a.sql', '17b.sql', '17c.sql', '17d.sql', '17e.sql', '17f.sql',\n '18a.sql', '18b.sql', '18c.sql',\n '19a.sql', '19b.sql', '19c.sql', '19d.sql',\n '20a.sql', '20b.sql', '20c.sql',\n '21a.sql', '21b.sql', '21c.sql',\n '22a.sql', '22b.sql', '22c.sql', '22d.sql',\n '23a.sql', '23b.sql', '23c.sql',\n '24a.sql', '24b.sql',\n '25a.sql', '25b.sql', '25c.sql',\n '26a.sql', '26b.sql', '26c.sql',\n '27a.sql', '27b.sql', '27c.sql',\n '28a.sql', '28b.sql', '28c.sql',\n '29a.sql', '29b.sql', '29c.sql',\n '30a.sql', '30b.sql', '30c.sql',\n '31a.sql', '31b.sql', '31c.sql',\n '32a.sql', '32b.sql',\n '33a.sql', '33b.sql', '33c.sql']\n\n print(all_file_list)\n files_list = []\n for file in all_file_list:\n 
files_list.append(res_path + file)\n return files_list\n\n\ndef custom_sort(item):\n # \u63d0\u53d6\u6570\u5b57\u548c\u5b57\u6bcd\u90e8\u5206\n match = re.match(r'(\\d+)(\\D+)', item)\n # \u5c06\u6570\u5b57\u90e8\u5206\u8f6c\u6362\u4e3a\u6574\u6570\u4ee5\u8fdb\u884c\u6bd4\u8f83\n num_part = int(match.group(1))\n # \u8fd4\u56de\u5143\u7ec4\u4ee5\u6309\u6570\u5b57\u548c\u5b57\u6bcd\u6392\u5e8f\n return (num_part, match.group(2))\n\n\ndef file_filter(f):\n if f[-4:] == '.sql' and 'schema' not in f and 'fkindexes' not in f:\n return True\n else:\n return False\n\n\ndef get_sql_from_file(file_name):\n file = open(file_name)\n lines = file.readlines().copy()\n sql = ''\n for line in lines:\n sql += line\n sql = sql.replace('\n', ' ').replace(' ', ' ').replace(' ', ' ')\n file.close()\n return sql\n\n\ndef test_hint_from_file(sql_file):\n db = Database()\n sql = get_sql_from_file(sql_file)\n success, result_cont = db.execute_sql(sql)\n print(success, result_cont)\n\n\ndef test_all():\n sql_files = all_sql_files()\n\n for sql_file in list(sql_files)[:-10]:\n if sql_file:\n test_hint_from_file(sql_file)\n\n\ndef test_one():\n res_path = \"{}/join-order-benchmark-master/\".format(\n os.path.dirname(os.path.abspath(__file__)))\n test_hint_from_file(res_path + '1a.sql')\n\n\nif __name__ == '__main__':\n for i in range(0, REPEATCOUNT):\n TIMELOG.write(str(int(time.time()))+\";\")\n test_all()\n TIMELOG.write(str(int(time.time()))+\"\n\")\n TIMELOG.flush()\n\n TIMELOG.close()\n\n)\n", + "description": "In a database used for customer relationship management, there is a poor performance issue when joining two large tables due to a lack of optimization. This results in high CPU contention, where multiple users compete for CPU resources, leading to processing delays.\n", + "desc": "In a business intelligence scenario, specifically in the database 'CorporateAnalyticsDB', which is used for analyzing business data, the performance of join operations between multiple tables is poor. This performance issue is primarily observed when complex queries involving multiple tables and joining keys are executed. Due to the large size of these tables and the absence of effective indexing or query optimization, executing these join operations consumes a significant amount of time and resources. Additionally, during peak periods with multiple concurrent complex join queries, there is contention for CPU resources, further impacting the efficiency of these queries. 
This CPU contention can occur either due to the presence of too many compute-intensive queries running on the database server or due to insufficient CPU resources on the server itself.\n" + }, + "431": { + "start_time": "1697359732", + "end_time": "1697359881", + "start_timestamp": "2023-10-15 16:48:52", + "end_timestamp": "2023-10-15 16:51:21", + "alerts": [], + "labels": [ + "FETCH_LARGE_DATA", + "CORRELATED SUBQUERY" + ], + "command": "python anomaly_trigger/main.py --anomaly FETCH_LARGE_DATA,CORRELATED_SUBQUERY", + "script": "import os\nimport re\nimport time\n\nimport psycopg2\n\n\nREPEATCOUNT = 1\nTIMELOGPATH = str(int(time.time())) + \"_tpch_trigger_time_log.txt\"\nTIMELOG = open(TIMELOGPATH, 'w+')\n\n\nclass Database():\n\n def __init__(self):\n self.conn = None\n self.conn = psycopg2.connect(database='tpch',\n user='xxxx',\n password='xxxx',\n host='xxxx',\n port=5432)\n\n def execute_sql(self, sql):\n fail = 1\n cur = self.conn.cursor()\n i = 0\n cnt = 3\n while fail == 1 and i < cnt:\n try:\n fail = 0\n cur.execute(sql)\n except BaseException as error:\n fail = 1\n print(error)\n res = []\n if fail == 0:\n res = cur.fetchall()\n i = i + 1\n if fail == 1:\n # print(\"SQL Execution Fatal!!\", sql)\n return 0, ''\n elif fail == 0:\n return 1, res\n\n\ndef all_sql_files():\n res_path = \"{}/tpch-queries/\".format(\n os.path.dirname(os.path.abspath(__file__)))\n # all_file_list = list(filter(file_filter, os.listdir(res_path)))\n # all_file_list = sorted(all_file_list, key=custom_sort)\n all_file_list = [\n '4.explain.sql']\n\n print(all_file_list)\n files_list = []\n for file in all_file_list:\n files_list.append(res_path + file)\n return files_list\n\n\ndef custom_sort(item):\n # \u63d0\u53d6\u6570\u5b57\u548c\u5b57\u6bcd\u90e8\u5206\n match = re.match(r'(\\d+)(\\D+)', item)\n # \u5c06\u6570\u5b57\u90e8\u5206\u8f6c\u6362\u4e3a\u6574\u6570\u4ee5\u8fdb\u884c\u6bd4\u8f83\n num_part = int(match.group(1))\n # \u8fd4\u56de\u5143\u7ec4\u4ee5\u6309\u6570\u5b57\u548c\u5b57\u6bcd\u6392\u5e8f\n return (num_part, match.group(2))\n\n\ndef file_filter(f):\n if f[-4:] == '.sql' and 'schema' not in f and 'fkindexes' not in f:\n return True\n else:\n return False\n\n\ndef get_sql_from_file(file_name):\n file = open(file_name)\n lines = file.readlines().copy()\n sql = ''\n for line in lines:\n sql += line\n sql = sql.replace('\n', ' ').replace(' ', ' ').replace(' ', ' ')\n file.close()\n return sql\n\n\ndef test_hint_from_file(sql_file):\n db = Database()\n sql = get_sql_from_file(sql_file)\n success, result_cont = db.execute_sql(sql)\n print(success, result_cont)\n\n\ndef test_all():\n sql_files = all_sql_files()\n\n for sql_file in list(sql_files):\n if sql_file:\n test_hint_from_file(sql_file)\n\n\ndef test_one():\n res_path = \"{}/tpch-queries/\".format(\n os.path.dirname(os.path.abspath(__file__)))\n test_hint_from_file(res_path + '1.explain.sql')\n\n\nif __name__ == '__main__':\n for i in range(0, REPEATCOUNT):\n TIMELOG.write(str(int(time.time()))+\";\")\n test_all()\n TIMELOG.write(str(int(time.time()))+\"\n\")\n TIMELOG.flush()\n\n TIMELOG.close()\n", + "description": "In a real-life scenario, this script is simulating a situation in an e-commerce platform's database where a large amount of data needs to be fetched and correlated subqueries are used. This could be similar to a situation where an online retailer needs to retrieve information about product inventory and sales, but the queries are not optimized for efficiency. 
As a result, the performance of querying inventory data is negatively affected due to the use of correlated subqueries.\n", + "desc": "In a database used in an e-commerce platform, there is a table called 'ProductInventory' that stores information about the inventory levels of various products. This table contains data for tens of thousands or even hundreds of thousands of products. Each product's inventory information includes the product ID, current stock level, last inventory update time, supplier ID, warehouse location, and other details. When querying the inventory level of each product, the database may need to perform correlated subqueries. For example, when querying the total current inventory of products in a specific category, the database would first select all products in that category and then perform subqueries on the 'ProductInventory' table to obtain the inventory data for those products. However, when there is a large number of products in a category, these correlated subqueries can become inefficient. It would take a long time to execute individual subqueries for each product to retrieve inventory information. As a result, the database might need to read a significant amount of data from the disk, leading to I/O bottlenecks.\n" + }, + "432": { + "start_time": "1697359942", + "end_time": "1697360014", + "start_timestamp": "2023-10-15 16:52:22", + "end_timestamp": "2023-10-15 16:53:34", + "alerts": [], + "labels": [ + "highly concurrent commits or highly concurrent inserts" + ], + "command": "python anomaly_trigger/main.py --anomaly INSERT_LARGE_DATA", + "script": "import psycopg2\nimport sys\nsys.path.append('/root/DB-GPT/')\nimport time\nimport datetime\nimport random\nimport yaml\nfrom multiprocessing.pool import *\n\nclass DBArgs(object):\n\n def __init__(self, dbtype, config, dbname=None):\n self.dbtype = dbtype\n if self.dbtype == 'mysql':\n self.host = config['host']\n self.port = config['port']\n self.user = config['user']\n self.password = config['password']\n self.dbname = dbname if dbname else config['dbname']\n self.driver = 'com.mysql.jdbc.Driver'\n self.jdbc = 'jdbc:mysql://'\n else:\n self.host = config['host']\n self.port = config['port']\n self.user = config['user']\n self.password = config['password']\n self.dbname = dbname if dbname else config['dbname']\n self.driver = 'org.postgresql.Driver'\n self.jdbc = 'jdbc:postgresql://'\n\nclass Database():\n def __init__(self, args, timeout=-1):\n self.args = args\n self.conn = self.resetConn(timeout)\n\n\n # self.schema = self.compute_table_schema()\n\n def resetConn(self, timeout=-1):\n conn = psycopg2.connect(database=self.args.dbname,\n user=self.args.user,\n password=self.args.password,\n host=self.args.host,\n port=self.args.port)\n return conn\n\n def execute_sqls(self,sql):\n self.conn =self.resetConn(timeout=-1)\n cur = self.conn.cursor()\n cur.execute(sql)\n self.conn.commit()\n cur.close()\n self.conn.close()\n\n def execute_sql_duration(self, duration, sql, max_id=0, commit_interval=500):\n self.conn = self.resetConn(timeout=-1)\n cursor = self.conn.cursor()\n start = time.time()\n cnt = 0\n if duration > 0:\n while (time.time() - start) < duration:\n if max_id > 0:\n id = random.randint(1, max_id - 1)\n cursor.execute(sql + str(id) + ';')\n else:\n cursor.execute(sql)\n cnt += 1\n if cnt % commit_interval == 0:\n self.conn.commit()\n else:\n print(\"error, the duration should be larger than 0\")\n self.conn.commit()\n cursor.close()\n self.conn.close()\n return cnt\n\n def 
concurrent_execute_sql(self, threads, duration, sql, max_id=0, commit_interval=500):\n pool = ThreadPool(threads)\n results = [pool.apply_async(self.execute_sql_duration, (duration, sql, max_id, commit_interval)) for _ in range(threads)]\n pool.close()\n pool.join()\n return results\n \ndef init():\n #add the config\n config_path = \"/root/DB-GPT/config/tool_config.yaml\"\n with open(config_path, 'r') as config_file:\n config = yaml.safe_load(config_file) \n db_args =DBArgs('pgsql', config)\n return db_args\n\n#create a table\ndef create_table(table_name,colsize, ncolumns):\n db=Database(init())\n column_definitions = ', '.join(f'name{i} varchar({colsize})' for i in range(ncolumns))\n creat_sql = f'CREATE TABLE {table_name} (id int, {column_definitions}, time timestamp);'\n db.execute_sqls(creat_sql)\n\n#delete the table\ndef delete_table(table_name):\n db=Database(init())\n delete_sql=f'DROP TABLE if exists {table_name}'\n db.execute_sqls(delete_sql)\n\n#print the current time\ndef print_time():\n current_time = datetime.datetime.now()\n formatted_time = current_time.strftime(\"%Y-%m-%d %H:%M:%S\")\n print(formatted_time)\n\n\ndef insert_large_data(threads,duration,ncolumns,nrows,colsize,table_name='table1'):\n\n print_time()\n #Delete undeleted tables\n delete_table(table_name)\n #create a new table\n create_table(table_name,colsize, ncolumns)\n db=Database(init())\n #insert the data\n #insert_definitions = ', '.join(f'repeat(round(random()*999)::text,{(colsize//3)})' for i in range(ncolumns))\n insert_definitions = ', '.join(f'(SELECT substr(md5(random()::text), 1, {colsize}))' for i in range(ncolumns))\n insert_data=f'insert into {table_name} select generate_series(1,{nrows}),{insert_definitions}, now();'\n db.concurrent_execute_sql(threads,duration,insert_data,commit_interval=1)\n\n #delete the table\n delete_table(table_name)\n \n #print the end time\n print_time()\nif __name__ == \"__main__\":\n # Number of threads to use for concurrent inserts\n num_threads = 114\n \n # Duration for which to run the inserts (in seconds)\n insert_duration = None\n \n # Number of columns in the table\n num_columns = 12\n \n # Number of rows to insert\n num_rows = 59\n \n # Size of each column (in characters)\n column_size = 79\n \n # Table name\n table_name = 'table1'\n \n # Call the insert_large_data function\n insert_large_data(num_threads, insert_duration, num_columns, num_rows, column_size, table_name)\n", + "description": "In a smart home system, 114 IoT devices are generating a large amount of data that needs to be inserted into the database simultaneously. Each device has 12 columns of data, with each column containing 79 characters. There are a total of 59 records being inserted. Simulate the database exception caused by this process.\n", + "desc": "In an Internet of Things (IoT) scenario, there is a database specifically used for collecting and analyzing sensor data in smart homes, named 'SensorDataDB'. This database is designed to handle a large volume of data from various types of sensors. The primary table in the database is called 'SensorReadings', which contains various fields to store data from 59 sensors. These fields may include sensor ID, reading type (such as temperature, humidity, pressure, light, motion, etc.), reading value, timestamp, sensor location, and status information. When all 114 sensors start transmitting data simultaneously at a very high frequency, the database might encounter performance issues. 
Due to the lack of effective data partitioning in the 'SensorReadings' table, insufficient buffering mechanisms, or improper indexing, the database's ability to process these numerous concurrent write requests is limited. This can lead to increased write latency in the database, and in some cases, may even result in database locking, ultimately leading to anomalies.\n" + }, + "433": { + "start_time": "1697360074", + "end_time": "1697360145", + "start_timestamp": "2023-10-15 16:54:34", + "end_timestamp": "2023-10-15 16:55:45", + "alerts": [], + "labels": [ + "highly concurrent commits or highly concurrent inserts" + ], + "command": "python anomaly_trigger/main.py --anomaly INSERT_LARGE_DATA", + "script": "import psycopg2\nimport sys\nsys.path.append('/root/DB-GPT/')\nimport time\nimport datetime\nimport random\nimport yaml\nfrom multiprocessing.pool import *\n\nclass DBArgs(object):\n\n def __init__(self, dbtype, config, dbname=None):\n self.dbtype = dbtype\n if self.dbtype == 'mysql':\n self.host = config['host']\n self.port = config['port']\n self.user = config['user']\n self.password = config['password']\n self.dbname = dbname if dbname else config['dbname']\n self.driver = 'com.mysql.jdbc.Driver'\n self.jdbc = 'jdbc:mysql://'\n else:\n self.host = config['host']\n self.port = config['port']\n self.user = config['user']\n self.password = config['password']\n self.dbname = dbname if dbname else config['dbname']\n self.driver = 'org.postgresql.Driver'\n self.jdbc = 'jdbc:postgresql://'\n\nclass Database():\n def __init__(self, args, timeout=-1):\n self.args = args\n self.conn = self.resetConn(timeout)\n\n\n # self.schema = self.compute_table_schema()\n\n def resetConn(self, timeout=-1):\n conn = psycopg2.connect(database=self.args.dbname,\n user=self.args.user,\n password=self.args.password,\n host=self.args.host,\n port=self.args.port)\n return conn\n\n def execute_sqls(self,sql):\n self.conn =self.resetConn(timeout=-1)\n cur = self.conn.cursor()\n cur.execute(sql)\n self.conn.commit()\n cur.close()\n self.conn.close()\n\n def execute_sql_duration(self, duration, sql, max_id=0, commit_interval=500):\n self.conn = self.resetConn(timeout=-1)\n cursor = self.conn.cursor()\n start = time.time()\n cnt = 0\n if duration > 0:\n while (time.time() - start) < duration:\n if max_id > 0:\n id = random.randint(1, max_id - 1)\n cursor.execute(sql + str(id) + ';')\n else:\n cursor.execute(sql)\n cnt += 1\n if cnt % commit_interval == 0:\n self.conn.commit()\n else:\n print(\"error, the duration should be larger than 0\")\n self.conn.commit()\n cursor.close()\n self.conn.close()\n return cnt\n\n def concurrent_execute_sql(self, threads, duration, sql, max_id=0, commit_interval=500):\n pool = ThreadPool(threads)\n results = [pool.apply_async(self.execute_sql_duration, (duration, sql, max_id, commit_interval)) for _ in range(threads)]\n pool.close()\n pool.join()\n return results\n \ndef init():\n #add the config\n config_path = \"/root/DB-GPT/config/tool_config.yaml\"\n with open(config_path, 'r') as config_file:\n config = yaml.safe_load(config_file) \n db_args =DBArgs('pgsql', config)\n return db_args\n\n#create a table\ndef create_table(table_name,colsize, ncolumns):\n db=Database(init())\n column_definitions = ', '.join(f'name{i} varchar({colsize})' for i in range(ncolumns))\n creat_sql = f'CREATE TABLE {table_name} (id int, {column_definitions}, time timestamp);'\n db.execute_sqls(creat_sql)\n\n#delete the table\ndef delete_table(table_name):\n db=Database(init())\n delete_sql=f'DROP TABLE if exists 
{table_name}'\n db.execute_sqls(delete_sql)\n\n#print the current time\ndef print_time():\n current_time = datetime.datetime.now()\n formatted_time = current_time.strftime(\"%Y-%m-%d %H:%M:%S\")\n print(formatted_time)\n\n\ndef insert_large_data(threads,duration,ncolumns,nrows,colsize,table_name='table1'):\n\n print_time()\n #Delete undeleted tables\n delete_table(table_name)\n #create a new table\n create_table(table_name,colsize, ncolumns)\n db=Database(init())\n #insert the data\n #insert_definitions = ', '.join(f'repeat(round(random()*999)::text,{(colsize//3)})' for i in range(ncolumns))\n insert_definitions = ', '.join(f'(SELECT substr(md5(random()::text), 1, {colsize}))' for i in range(ncolumns))\n insert_data=f'insert into {table_name} select generate_series(1,{nrows}),{insert_definitions}, now();'\n db.concurrent_execute_sql(threads,duration,insert_data,commit_interval=1)\n\n #delete the table\n delete_table(table_name)\n \n #print the end time\n print_time()\nif __name__ == \"__main__\":\n # Number of threads to use for concurrent inserts\n num_threads = 114\n \n # Duration for which to run the inserts (in seconds)\n insert_duration = None\n \n # Number of columns in the table\n num_columns = 30\n \n # Number of rows to insert\n num_rows = 100\n \n # Size of each column (in characters)\n column_size = 95\n \n # Table name\n table_name = 'table1'\n \n # Call the insert_large_data function\n insert_large_data(num_threads, insert_duration, num_columns, num_rows, column_size, table_name)\n", + "description": "In a data-intensive system, 114 sensors are generating a large amount of data simultaneously, and this data needs to be inserted into the database. The data consists of 30 columns, with each column having a size of 95 characters, and there are 100 records in total. This process may cause an exception in the database due to the high volume of data being inserted at once.\n", + "desc": "In an e-commerce platform, there is a database named 'ECommerceDB' that handles the storage and processing of product data. This database contains a crucial table called 'ProductData', which stores detailed information about various products. Each product is represented by a row in this table, with 30 columns including product ID, name, price, stock quantity, description, brand, category, size, color, weight, production date, expiration date, supplier information, country, rating, number of reviews, sales status, promotional information, image link, and other relevant attributes. With a total of 100 rows, this table holds a considerable amount of data. The 'INSERT_LARGE_DATA' anomaly refers to a situation where a large amount of new data is being inserted into this table simultaneously by 114 threads. This could be due to a high demand for adding new products to the platform or bulk imports from suppliers. 
However, without proper optimization measures, such as efficient data partitioning, batch processing, or optimal use of indexes, this mass-insertion operation can put a strain on the database's performance and might lead to anomalies or performance issues.\n" + }, + "434": { + "start_time": "1697360205", + "end_time": "1697360266", + "start_timestamp": "2023-10-15 16:56:45", + "end_timestamp": "2023-10-15 16:57:46", + "alerts": [], + "labels": [ + "highly concurrent updates" + ], + "command": "python anomaly_trigger/main.py --anomaly LOCK_CONTENTION", + "script": "import psycopg2\nimport sys\nsys.path.append('/root/DB-GPT/')\nimport time\nimport datetime\nimport random\nimport yaml\nfrom multiprocessing.pool import *\n\n\nclass DBArgs(object):\n\n def __init__(self, dbtype, config, dbname=None):\n self.dbtype = dbtype\n if self.dbtype == 'mysql':\n self.host = config['host']\n self.port = config['port']\n self.user = config['user']\n self.password = config['password']\n self.dbname = dbname if dbname else config['dbname']\n self.driver = 'com.mysql.jdbc.Driver'\n self.jdbc = 'jdbc:mysql://'\n else:\n self.host = config['host']\n self.port = config['port']\n self.user = config['user']\n self.password = config['password']\n self.dbname = dbname if dbname else config['dbname']\n self.driver = 'org.postgresql.Driver'\n self.jdbc = 'jdbc:postgresql://'\n\nclass Database():\n def __init__(self, args, timeout=-1):\n self.args = args\n self.conn = self.resetConn(timeout)\n\n\n # self.schema = self.compute_table_schema()\n\n def resetConn(self, timeout=-1):\n conn = psycopg2.connect(database=self.args.dbname,\n user=self.args.user,\n password=self.args.password,\n host=self.args.host,\n port=self.args.port)\n return conn\n \n def execute_sqls(self,sql):\n self.conn =self.resetConn(timeout=-1)\n cur = self.conn.cursor()\n cur.execute(sql)\n self.conn.commit()\n cur.close()\n self.conn.close()\n\n def execute_sql_duration(self, duration, sql, max_id=0, commit_interval=500):\n self.conn = self.resetConn(timeout=-1)\n cursor = self.conn.cursor()\n start = time.time()\n cnt = 0\n if duration > 0:\n while (time.time() - start) < duration:\n if max_id > 0:\n id = random.randint(1, max_id - 1)\n cursor.execute(sql + str(id) + ';')\n else:\n cursor.execute(sql)\n cnt += 1\n if cnt % commit_interval == 0:\n self.conn.commit()\n else:\n print(\"error, the duration should be larger than 0\")\n self.conn.commit()\n cursor.close()\n self.conn.close()\n return cnt\n\n def concurrent_execute_sql(self, threads, duration, sql, max_id=0, commit_interval=500):\n pool = ThreadPool(threads)\n results = [pool.apply_async(self.execute_sql_duration, (duration, sql, max_id, commit_interval)) for _ in range(threads)]\n pool.close()\n pool.join()\n return results\n\ndef init():\n #add the config\n config_path = \"/root/DB-GPT/config/tool_config.yaml\"\n with open(config_path, 'r') as config_file:\n config = yaml.safe_load(config_file) \n db_args =DBArgs('pgsql', config)\n return db_args\n\n\n#create a table\ndef create_table(table_name,colsize, ncolumns):\n db=Database(init())\n column_definitions = ', '.join(f'name{i} varchar({colsize})' for i in range(ncolumns))\n creat_sql = f'CREATE TABLE {table_name} (id int, {column_definitions}, time timestamp);'\n db.execute_sqls(creat_sql)\n\n#delete the table\ndef delete_table(table_name):\n db=Database(init())\n delete_sql=f'DROP TABLE if exists {table_name}'\n db.execute_sqls(delete_sql)\n\n#print the current time\ndef print_time():\n current_time = datetime.datetime.now()\n 
formatted_time = current_time.strftime(\"%Y-%m-%d %H:%M:%S\")\n print(formatted_time)\n\n\ndef lock_contention(threads,duration,ncolumns,nrows,colsize,table_name='table1'):\n #create a new table\n print_time()\n delete_table(table_name)\n create_table(table_name,colsize, ncolumns)\n db=Database(init())\n # insert some data to be updated \n insert_definitions = ', '.join(f'(SELECT substr(md5(random()::text), 1, {colsize}))' for i in range(ncolumns))\n insert_data=f'insert into {table_name} select generate_series(1,{nrows}),{insert_definitions}, now();' \n db.execute_sqls(insert_data) \n pool = Pool(threads)\n for _ in range(threads):\n pool.apply_async(\n lock, (table_name, ncolumns, colsize, duration, nrows))\n pool.close()\n pool.join()\n #delete the table\n delete_table(table_name)\n print_time()\n\ndef lock(table_name, ncolumns, colsize, duration, nrows):\n args=init()\n start = time.time()\n #lock_contention\n while time.time()-start < duration:\n conn = psycopg2.connect(database=args.dbname, user=args.user, password=args.password,\n host=args.host, port=args.port)\n cur = conn.cursor()\n while time.time()-start < duration:\n col_name = random.randint(0, ncolumns-1)\n row_name = random.randint(1, nrows-1)\n lock_contention = f'update {table_name} set name{col_name}=(SELECT substr(md5(random()::text), 1, {colsize})) where id ={row_name}'\n #db.concurrent_execute_sql(threads,duration,lock_contention,nrows)\n cur.execute(lock_contention)\n conn.commit()\n conn.commit()\n conn.close()\n\nif __name__ == \"__main__\":\n # Number of threads to use for concurrent inserts\n num_threads = 200\n \n # Duration for which to run the inserts (in seconds)\n insert_duration = None\n \n # Number of columns in the table\n num_columns = 57\n \n # Number of rows to insert\n num_rows = 398\n \n # Size of each column (in characters)\n column_size = 97\n \n # Table name\n table_name = 'table1'\n \n # Call the insert_large_data function\n lock_contention(num_threads, insert_duration, num_columns, num_rows, column_size, table_name)\n", + "description": "In a database used by an online platform, simulate a scenario where 200 users simultaneously try to update a table containing 57 columns and 398 rows, with each column having a size of 97 characters. These users compete with each other to lock the table, resulting in contention and potentially causing an exception in the database.\n", + "desc": "In the banking scenario, there is a database called 'BankingDB' used for handling customer and transaction data. Within this database, there is a key table named 'AccountTransactions' that records detailed information about various banking transactions. This table consists of 398 rows of data, each representing a transaction record for an account. It has a total of 57 columns, each containing information of up to 97 characters. These columns may include transaction ID, account number, transaction type (such as deposit, withdrawal, transfer), transaction amount, transaction date and time, counterparty account information, transaction status, employee ID, transaction location, currency type, and more. On a typical banking business day, there may be multiple users (such as bank staff or customers through an online banking platform) who simultaneously attempt to perform frequent update operations on the 'AccountTransactions' table. These operations could involve updating transaction statuses, modifying transaction amounts, or adding transaction notes. 
In a specific scenario, 200 users are concurrently attempting to update the same or adjacent rows in the 'AccountTransactions' table. Due to the table's design and the database's locking mechanism, this could result in contention and competition for locking the database table. If the locking lasts for an extended period, it could lead to performance issues in the database. During peak hours, prolonged locking could cause delays or failures in other users' transaction requests, thereby impacting the daily operations of the bank. Additionally, frequent occurrences of such incidents could result in rapid growth in the database transaction log, consuming excessive storage space, and causing temporary interruptions in database services.\n" + }, + "435": { + "start_time": "1697360326", + "end_time": "1697360377", + "start_timestamp": "2023-10-15 16:58:46", + "end_timestamp": "2023-10-15 16:59:37", + "alerts": [], + "labels": [ + "highly deletes" + ], + "command": "python anomaly_trigger/main.py --anomaly VACUUM", + "script": "import psycopg2\nimport sys\nsys.path.append('/root/DB-GPT/')\nimport time\nimport datetime\nimport random\nimport yaml\nfrom multiprocessing.pool import *\n\n\nclass DBArgs(object):\n\n def __init__(self, dbtype, config, dbname=None):\n self.dbtype = dbtype\n if self.dbtype == 'mysql':\n self.host = config['host']\n self.port = config['port']\n self.user = config['user']\n self.password = config['password']\n self.dbname = dbname if dbname else config['dbname']\n self.driver = 'com.mysql.jdbc.Driver'\n self.jdbc = 'jdbc:mysql://'\n else:\n self.host = config['host']\n self.port = config['port']\n self.user = config['user']\n self.password = config['password']\n self.dbname = dbname if dbname else config['dbname']\n self.driver = 'org.postgresql.Driver'\n self.jdbc = 'jdbc:postgresql://'\n\nclass Database():\n def __init__(self, args, timeout=-1):\n self.args = args\n self.conn = self.resetConn(timeout)\n\n\n # self.schema = self.compute_table_schema()\n\n def resetConn(self, timeout=-1):\n conn = psycopg2.connect(database=self.args.dbname,\n user=self.args.user,\n password=self.args.password,\n host=self.args.host,\n port=self.args.port)\n return conn\n \n def execute_sqls(self,sql):\n self.conn =self.resetConn(timeout=-1)\n cur = self.conn.cursor()\n cur.execute(sql)\n self.conn.commit()\n cur.close()\n self.conn.close()\n\n def execute_sql_duration(self, duration, sql, max_id=0, commit_interval=500):\n self.conn = self.resetConn(timeout=-1)\n cursor = self.conn.cursor()\n start = time.time()\n cnt = 0\n if duration > 0:\n while (time.time() - start) < duration:\n if max_id > 0:\n id = random.randint(1, max_id - 1)\n cursor.execute(sql + str(id) + ';')\n else:\n cursor.execute(sql)\n cnt += 1\n if cnt % commit_interval == 0:\n self.conn.commit()\n else:\n print(\"error, the duration should be larger than 0\")\n self.conn.commit()\n cursor.close()\n self.conn.close()\n return cnt\n\n def concurrent_execute_sql(self, threads, duration, sql, max_id=0, commit_interval=500):\n pool = ThreadPool(threads)\n results = [pool.apply_async(self.execute_sql_duration, (duration, sql, max_id, commit_interval)) for _ in range(threads)]\n pool.close()\n pool.join()\n return results\n \ndef init():\n #add the config\n config_path = \"/root/DB-GPT/config/tool_config.yaml\"\n with open(config_path, 'r') as config_file:\n config = yaml.safe_load(config_file) \n db_args =DBArgs('pgsql', config)\n return db_args\n\n\n#create a table\ndef create_table(table_name,colsize, ncolumns):\n 
db=Database(init())\n column_definitions = ', '.join(f'name{i} varchar({colsize})' for i in range(ncolumns))\n creat_sql = f'CREATE TABLE {table_name} (id int, {column_definitions}, time timestamp);'\n db.execute_sqls(creat_sql)\n\n#delete the table\ndef delete_table(table_name):\n db=Database(init())\n delete_sql=f'DROP TABLE if exists {table_name}'\n db.execute_sqls(delete_sql)\n\n#print the current time\ndef print_time():\n current_time = datetime.datetime.now()\n formatted_time = current_time.strftime(\"%Y-%m-%d %H:%M:%S\")\n print(formatted_time)\n\n\ndef vacuum(threads,duration,ncolumns,nrows,colsize,table_name='table1'):\n db=Database(init())\n #create a new table\n print_time()\n delete_table(table_name)\n create_table(table_name,colsize, ncolumns)\n\n # insert some data to be deleted\n insert_definitions = ', '.join(f'(SELECT substr(md5(random()::text), 1, {colsize}))' for i in range(ncolumns))\n insert_data=f'insert into {table_name} select generate_series(1,{nrows}),{insert_definitions}, now();' \n db.execute_sqls(insert_data) \n\n # delete 80% of the rows\n delete_nrows=int(nrows*0.8)\n vacuum=f'delete from {table_name} where id < {delete_nrows};'\n db.execute_sqls(vacuum)\n\n # do the select , then the vacuum occurs\n select='select * from '+table_name+' where id='\n db.concurrent_execute_sql(threads,duration,select,nrows)\n\n #delete the table\n delete_table(table_name)\n print_time()\n\n\nif __name__ == \"__main__\":\n # Number of threads to use for concurrent inserts\n num_threads = 124\n \n # Duration for which to run the inserts (in seconds)\n insert_duration = None\n \n # Number of columns in the table\n num_columns = 12\n \n # Number of rows to insert\n num_rows = 3238784\n \n # Size of each column (in characters)\n column_size = 95\n \n # Table name\n table_name = 'table1'\n \n # Call the insert_large_data function\n vacuum(num_threads, insert_duration, num_columns, num_rows, column_size, table_name)\n", + "description": "In a database system used by an online marketplace, there is a process command that triggers the \"VACUUM\" anomaly scenario. This scenario involves simulating a situation where 124 users are actively performing searches on a table with 12 columns and 3,238,784 rows of product records. Each column has a size of 95 characters. The purpose of this simulation is to expose any exceptions or issues that may arise when multiple users search the database after a database cleanup process.\n", + "desc": "In an e-commerce platform's database, there is a database specifically used for an online store named 'OnlineStoreDB'. This database contains a key table named 'ProductRecords', which records detailed information about various products. This table consists of approximately 3,238,784 rows of data, with each row representing an independent product entry. The table contains 12 columns, with each column containing information of up to 95 characters. These columns may include product ID, name, price, stock quantity, description, brand, category, size, color, weight, supplier information, and country. At a certain moment, due to business needs such as product updates or data consolidation, there is a requirement to perform a vacuum operation. This vacuum operation involves reclaiming unused space and optimizing the table's storage. 
If not performed with due care and consideration, such as properly scheduling the operation during low traffic periods or implementing optimization techniques to minimize the impact, performing the vacuum operation on such a large-scale table could cause anomalies. These anomalies could affect other database operations, such as the insertion of new products, price updates, or user queries, thereby impacting the overall performance and efficiency of the online store.\n" + }, + "436": { + "start_time": "1697360437", + "end_time": "1697360552", + "start_timestamp": "2023-10-15 17:00:37", + "end_timestamp": "2023-10-15 17:02:32", + "alerts": [], + "labels": [ + "too many indexes" + ], + "command": "python anomaly_trigger/main.py --anomaly REDUNDANT_INDEX", + "script": "import psycopg2\nimport sys\nsys.path.append('/root/DB-GPT/')\nimport time\nimport datetime\nimport random\nimport yaml\nfrom multiprocessing.pool import *\n\n\nclass DBArgs(object):\n\n def __init__(self, dbtype, config, dbname=None):\n self.dbtype = dbtype\n if self.dbtype == 'mysql':\n self.host = config['host']\n self.port = config['port']\n self.user = config['user']\n self.password = config['password']\n self.dbname = dbname if dbname else config['dbname']\n self.driver = 'com.mysql.jdbc.Driver'\n self.jdbc = 'jdbc:mysql://'\n else:\n self.host = config['host']\n self.port = config['port']\n self.user = config['user']\n self.password = config['password']\n self.dbname = dbname if dbname else config['dbname']\n self.driver = 'org.postgresql.Driver'\n self.jdbc = 'jdbc:postgresql://'\n\nclass Database():\n def __init__(self, args, timeout=-1):\n self.args = args\n self.conn = self.resetConn(timeout)\n\n\n # self.schema = self.compute_table_schema()\n\n def resetConn(self, timeout=-1):\n conn = psycopg2.connect(database=self.args.dbname,\n user=self.args.user,\n password=self.args.password,\n host=self.args.host,\n port=self.args.port)\n return conn\n \n def execute_sqls(self,sql):\n self.conn =self.resetConn(timeout=-1)\n cur = self.conn.cursor()\n cur.execute(sql)\n self.conn.commit()\n cur.close()\n self.conn.close()\n\n def execute_sql_duration(self, duration, sql, max_id=0, commit_interval=500):\n self.conn = self.resetConn(timeout=-1)\n cursor = self.conn.cursor()\n start = time.time()\n cnt = 0\n if duration > 0:\n while (time.time() - start) < duration:\n if max_id > 0:\n id = random.randint(1, max_id - 1)\n cursor.execute(sql + str(id) + ';')\n else:\n cursor.execute(sql)\n cnt += 1\n if cnt % commit_interval == 0:\n self.conn.commit()\n else:\n print(\"error, the duration should be larger than 0\")\n self.conn.commit()\n cursor.close()\n self.conn.close()\n return cnt\n\n def concurrent_execute_sql(self, threads, duration, sql, max_id=0, commit_interval=500):\n pool = ThreadPool(threads)\n results = [pool.apply_async(self.execute_sql_duration, (duration, sql, max_id, commit_interval)) for _ in range(threads)]\n pool.close()\n pool.join()\n return results\n \n def build_index(self, table_name, idx_num):\n self.conn = self.resetConn(timeout=-1)\n cursor = self.conn.cursor()\n \n for i in range(0, idx_num):\n the_sql = 'CREATE INDEX index_' + table_name + '_' + str(i) + ' ON ' + table_name + '(name' + str(i) + ');'\n print(the_sql)\n cursor.execute(the_sql)\n\n \n self.conn.commit()\n self.conn.close()\n return\n\n\n \n def drop_index(self,table_name):\n self.conn = self.resetConn(timeout=-1)\n cursor = self.conn.cursor()\n cursor.execute(\"select indexname from pg_indexes where tablename='\"+table_name+\"';\")\n idxs 
= cursor.fetchall()\n for idx in idxs:\n the_sql = 'DROP INDEX ' + idx[0] + ';'\n cursor.execute(the_sql)\n print(the_sql)\n self.conn.commit()\n self.conn.close()\n return\n\n\ndef init():\n #add the config\n config_path = \"/root/DB-GPT/config/tool_config.yaml\"\n with open(config_path, 'r') as config_file:\n config = yaml.safe_load(config_file) \n db_args =DBArgs('pgsql', config)\n return db_args\n\n\n#create a table\ndef create_table(table_name,colsize, ncolumns):\n db=Database(init())\n column_definitions = ', '.join(f'name{i} varchar({colsize})' for i in range(ncolumns))\n creat_sql = f'CREATE TABLE {table_name} (id int, {column_definitions}, time timestamp);'\n db.execute_sqls(creat_sql)\n\n#delete the table\ndef delete_table(table_name):\n db=Database(init())\n delete_sql=f'DROP TABLE if exists {table_name}'\n db.execute_sqls(delete_sql)\n\n#print the current time\ndef print_time():\n current_time = datetime.datetime.now()\n formatted_time = current_time.strftime(\"%Y-%m-%d %H:%M:%S\")\n print(formatted_time)\n\ndef redundent_index(threads,duration,ncolumns,nrows,colsize,nindex,table_name='table1'):\n #create a new table\n print_time()\n delete_table(table_name)\n create_table(table_name,colsize, ncolumns)\n db=Database(init())\n # insert some data to be updated \n insert_definitions = ', '.join(f'(SELECT substr(md5(random()::text), 1, {colsize}))' for i in range(ncolumns))\n insert_data=f'insert into {table_name} select generate_series(1,{nrows}),{insert_definitions}, now();' \n db.execute_sqls(insert_data) \n\n #initialization of the indexes\n nindex=int((nindex*ncolumns)/10)\n db.build_index(table_name,nindex)\n id_index='CREATE INDEX index_'+table_name+'_id ON '+table_name+'(id);'\n db.execute_sqls(id_index)\n\n #lock_contention\n pool = Pool(threads)\n for _ in range(threads):\n pool.apply_async(\n lock, (table_name, ncolumns, colsize, duration, nrows))\n pool.close()\n pool.join()\n\n #drop the index\n db.drop_index(table_name)\n\n #delete the table\n delete_table(table_name)\n print_time()\n\n\ndef lock(table_name, ncolumns, colsize, duration, nrows):\n args=init()\n start = time.time()\n #lock_contention\n while time.time()-start < duration:\n conn = psycopg2.connect(database=args.dbname, user=args.user, password=args.password,\n host=args.host, port=args.port)\n cur = conn.cursor()\n while time.time()-start < duration:\n col_name = random.randint(0, ncolumns-1)\n row_name = random.randint(1, nrows-1)\n lock_contention = f'update {table_name} set name{col_name}=(SELECT substr(md5(random()::text), 1, {colsize})) where id ={row_name}'\n #db.concurrent_execute_sql(threads,duration,lock_contention,nrows)\n cur.execute(lock_contention)\n conn.commit()\n conn.commit()\n conn.close()\n\nif __name__ == \"__main__\":\n # Number of threads to use for concurrent inserts\n num_threads = 7\n \n # Duration for which to run the inserts (in seconds)\n insert_duration = None\n \n # Number of columns in the table\n num_columns = 90\n \n # Number of rows to insert\n num_rows = 440562\n \n # Size of each column (in characters)\n column_size = 87\n \n # Table name\n table_name = 'table1'\n \n nindex=6\n \n # Call the insert_large_data function\n redundent_index(num_threads, insert_duration, num_columns, num_rows, column_size, nindex,table_name)\n", + "description": "In an online marketplace database with 90 columns and 440,562 rows, each with a column size of 87 characters, a large number of indexes are created for various attributes such as product name, category, and price range. 
However, these indexes are redundant and do not significantly improve the performance of queries. Running 7 threads, this script simulates the storage overhead and potential performance degradation caused by maintaining the redundant indexes.\n", + "desc": "In an e-commerce platform database, an anomaly occurs due to the creation of redundant indexes. The database named 'ECommerceDB' is used for storing and processing product information. One of the key tables in this database is 'ProductCatalog', which contains detailed information about various products. This table consists of 440,562 rows of data, each representing a unique product, with a total of 90 columns. These columns include product ID, name, price, stock quantity, description, brand, category, size, color, weight, production date, expiration date, supplier information, country, rating, number of reviews, discount information, image link, and other related attributes. During the normal operation of the platform, the database administrator might create redundant indexes to improve query performance. For example, they may create indexes on frequently used search columns such as name, brand, and category. However, if excessive redundant indexes are created without proper evaluation or consideration of their impact, it can lead to performance degradation. The redundant indexes consume extra storage space and can introduce overhead during data modification operations. Moreover, these redundant indexes may cause fragmentation and inefficiency in query execution.In this scenario, the 'REDUNDANT_INDEX' anomaly is simulated by passing the parameters '--anomaly REDUNDANT_INDEX' to the script. Additionally, the script is run with 7 concurrent threads, and the 'ProductCatalog' table has 90 columns, each with a size of 87 characters. The table contains a total of 440,562 rows of data. This scenario aims to replicate the negative impact of redundant index creation on the e-commerce platform database, emphasizing the potentially harmful consequences of inefficient index management on overall database performance.\n" + }, + "437": { + "start_time": "1697360612", + "end_time": "1697360703", + "start_timestamp": "2023-10-15 17:03:32", + "end_timestamp": "2023-10-15 17:05:03", + "alerts": [], + "labels": [ + "INSERT_LARGE_DATA", + "IO_CONTENTION" + ], + "command": "python anomaly_trigger/main.py --anomaly INSERT_LARGE_DATA,IO_CONTENTION", + "script": "import os\nimport datetime\n\n#print the current time\ndef print_time():\n current_time = datetime.datetime.now()\n formatted_time = current_time.strftime(\"%Y-%m-%d %H:%M:%S\")\n print(formatted_time)\n\nif __name__ == \"__main__\":\n print_time()\n command = (\n \"su - root -c 'cd /sysbench-tpcc-master; \"\n \"./tpcc.lua --db-driver=pgsql --tables=2 --scale=3 --threads=50 --events=0 \"\n \"--pgsql-host=xxxx --pgsql-user=xxxx --pgsql-password=xxxx \"\n \"--pgsql-port=5432 --pgsql-db=tpcc --time=90 --rand-type=uniform --report-interval=10 run'\"\n )\n\n os.system(command)\n print_time()\n", + "description": "In a file sharing system, multiple users are uploading, downloading, or editing files simultaneously. The system is facing I/O contention, resulting in slower file transfer speeds.\n", + "desc": "In a file sharing system scenario, there is a database called 'FileShareDB' that is used by teams or organizations to share files. 
This database stores both the files themselves and metadata related to the files, such as uploader information, file size, creation and modification dates, version history, access permissions, and download counts. On a typical day, multiple users may be simultaneously uploading, downloading, or editing files. This could involve team members collaborating on a project, uploading the latest versions of files, while others download them for viewing or editing. Additionally, the system supports the storage and sharing of large files, such as presentations, video conference recordings, or design drawings. However, due to the high concurrency in file operations, the database may face challenges related to input/output (I/O) contention. When multiple users are simultaneously uploading or downloading large files, it can significantly strain the storage and network bandwidth of the system. This I/O contention can result in slower file transfer speeds, especially in situations where there is limited bandwidth or insufficient server processing capabilities. Furthermore, frequent write operations in the database, such as file uploads and metadata updates, can impact database performance. During peak periods, the database may encounter locking and transaction management issues, further slowing down file processing and metadata recording.\n" + }, + "438": { + "start_time": "1697360763", + "end_time": "1697360823", + "start_timestamp": "2023-10-15 17:06:03", + "end_timestamp": "2023-10-15 17:07:03", + "alerts": [], + "labels": [ + "POOR JOIN PERFORMANCE", + "CPU CONTENTION" + ], + "command": "python anomaly_trigger/main.py --anomaly POOR_JOIN_PERFORMANCE,CPU_CONTENTION", + "script": "import os\nimport re\nimport time\n\nimport psycopg2\n\n\nREPEATCOUNT = 1\nTIMELOGPATH = str(int(time.time())) + \"_job_trigger_time_log.txt\"\nTIMELOG = open(TIMELOGPATH, 'w+')\n\n\nclass Database():\n\n def __init__(self):\n self.conn = None\n self.conn = psycopg2.connect(database='imdbload',\n user='xxxx',\n password='xxxx',\n host='xxxx',\n port=5432)\n\n def execute_sql(self, sql):\n fail = 1\n cur = self.conn.cursor()\n i = 0\n cnt = 3\n while fail == 1 and i < cnt:\n try:\n fail = 0\n cur.execute(sql)\n except BaseException as error:\n fail = 1\n print(error)\n res = []\n if fail == 0:\n res = cur.fetchall()\n i = i + 1\n if fail == 1:\n # print(\"SQL Execution Fatal!!\", sql)\n return 0, ''\n elif fail == 0:\n return 1, res\n\n\ndef all_sql_files():\n res_path = \"{}/join-order-benchmark-master/\".format(\n os.path.dirname(os.path.abspath(__file__)))\n # all_file_list = list(filter(file_filter, os.listdir(res_path)))\n # all_file_list = sorted(all_file_list, key=custom_sort)\n all_file_list = [\n '1a.sql', '1b.sql', '1c.sql', '1d.sql',\n '2a.sql', '2b.sql', '2c.sql', '2d.sql',\n '3a.sql', '3b.sql', '3c.sql',\n '4a.sql', '4b.sql', '4c.sql',\n '5a.sql', '5b.sql', '5c.sql',\n '6a.sql', '6b.sql', '6c.sql', '6d.sql', '6e.sql', '6f.sql',\n '7a.sql', '7b.sql', '7c.sql',\n '8a.sql', '8b.sql', '8c.sql', '8d.sql',\n '9a.sql', '9b.sql', '9c.sql', '9d.sql',\n '10a.sql', '10b.sql', '10c.sql',\n '11a.sql', '11b.sql', '11c.sql', '11d.sql',\n '12a.sql', '12b.sql', '12c.sql',\n '13a.sql', '13b.sql', '13c.sql', '13d.sql',\n '14a.sql', '14b.sql', '14c.sql',\n '15a.sql', '15b.sql', '15c.sql', '15d.sql',\n '16a.sql', '16b.sql', '16c.sql', '16d.sql',\n '17a.sql', '17b.sql', '17c.sql', '17d.sql', '17e.sql', '17f.sql',\n '18a.sql', '18b.sql', '18c.sql',\n '19a.sql', '19b.sql', '19c.sql', '19d.sql',\n '20a.sql', '20b.sql', '20c.sql',\n 
'21a.sql', '21b.sql', '21c.sql',\n '22a.sql', '22b.sql', '22c.sql', '22d.sql',\n '23a.sql', '23b.sql', '23c.sql',\n '24a.sql', '24b.sql',\n '25a.sql', '25b.sql', '25c.sql',\n '26a.sql', '26b.sql', '26c.sql',\n '27a.sql', '27b.sql', '27c.sql',\n '28a.sql', '28b.sql', '28c.sql',\n '29a.sql', '29b.sql', '29c.sql',\n '30a.sql', '30b.sql', '30c.sql',\n '31a.sql', '31b.sql', '31c.sql',\n '32a.sql', '32b.sql',\n '33a.sql', '33b.sql', '33c.sql']\n\n print(all_file_list)\n files_list = []\n for file in all_file_list:\n files_list.append(res_path + file)\n return files_list\n\n\ndef custom_sort(item):\n # \u63d0\u53d6\u6570\u5b57\u548c\u5b57\u6bcd\u90e8\u5206\n match = re.match(r'(\\d+)(\\D+)', item)\n # \u5c06\u6570\u5b57\u90e8\u5206\u8f6c\u6362\u4e3a\u6574\u6570\u4ee5\u8fdb\u884c\u6bd4\u8f83\n num_part = int(match.group(1))\n # \u8fd4\u56de\u5143\u7ec4\u4ee5\u6309\u6570\u5b57\u548c\u5b57\u6bcd\u6392\u5e8f\n return (num_part, match.group(2))\n\n\ndef file_filter(f):\n if f[-4:] == '.sql' and 'schema' not in f and 'fkindexes' not in f:\n return True\n else:\n return False\n\n\ndef get_sql_from_file(file_name):\n file = open(file_name)\n lines = file.readlines().copy()\n sql = ''\n for line in lines:\n sql += line\n sql = sql.replace('\n', ' ').replace(' ', ' ').replace(' ', ' ')\n file.close()\n return sql\n\n\ndef test_hint_from_file(sql_file):\n db = Database()\n sql = get_sql_from_file(sql_file)\n success, result_cont = db.execute_sql(sql)\n print(success, result_cont)\n\n\ndef test_all():\n sql_files = all_sql_files()\n\n for sql_file in list(sql_files)[:-10]:\n if sql_file:\n test_hint_from_file(sql_file)\n\n\ndef test_one():\n res_path = \"{}/join-order-benchmark-master/\".format(\n os.path.dirname(os.path.abspath(__file__)))\n test_hint_from_file(res_path + '1a.sql')\n\n\nif __name__ == '__main__':\n for i in range(0, REPEATCOUNT):\n TIMELOG.write(str(int(time.time()))+\";\")\n test_all()\n TIMELOG.write(str(int(time.time()))+\"\n\")\n TIMELOG.flush()\n\n TIMELOG.close()\n\n)\n", + "description": "In a database system used by a financial organization, a poor join performance is simulated wherein multiple users attempt to join large tables without proper indexing or optimization. This process causes contention for CPU resources and severely affects system performance and response time.\n", + "desc": "In the business intelligence scenario, suppose there is a database called 'CorporateAnalyticsDB' used for analyzing various business data of a large corporation. This database consists of multiple complex tables such as 'SalesData', 'CustomerProfiles', 'ProductCatalog', and 'MarketTrends', each containing a large number of rows and columns, recording detailed business information. The analysts in the company often need to execute complex join queries across these tables to generate comprehensive business reports. For example, they might need to join the 'SalesData' table (containing sales records) with the 'CustomerProfiles' table (containing customer information) to analyze the purchasing behaviors of different customer groups. Furthermore, they might also need to link these data with the 'ProductCatalog' (product catalog) and 'MarketTrends' (market trends) tables to gain deeper market insights. However, due to the large size of these tables and the involvement of multi-table joins, the performance of these queries can become slow. 
If the join keys are not effectively indexed or if the queries are not optimized correctly, executing these join operations can consume a significant amount of time and resources. During peak periods, when multiple complex join queries are executed simultaneously, there might be competition for CPU resources, which further reduces query efficiency. This CPU contention might occur due to an excessive number of compute-intensive queries running on the database server or insufficient CPU resources on the server to handle these queries.\n" + }, + "439": { + "start_time": "1697360883", + "end_time": "1697361033", + "start_timestamp": "2023-10-15 17:08:03", + "end_timestamp": "2023-10-15 17:10:33", + "alerts": [], + "labels": [ + "FETCH_LARGE_DATA", + "CORRELATED SUBQUERY" + ], + "command": "python anomaly_trigger/main.py --anomaly FETCH_LARGE_DATA,CORRELATED_SUBQUERY", + "script": "import os\nimport re\nimport time\n\nimport psycopg2\n\n\nREPEATCOUNT = 1\nTIMELOGPATH = str(int(time.time())) + \"_tpch_trigger_time_log.txt\"\nTIMELOG = open(TIMELOGPATH, 'w+')\n\n\nclass Database():\n\n def __init__(self):\n self.conn = None\n self.conn = psycopg2.connect(database='tpch',\n user='xxxx',\n password='xxxx',\n host='xxxx',\n port=5432)\n\n def execute_sql(self, sql):\n fail = 1\n cur = self.conn.cursor()\n i = 0\n cnt = 3\n while fail == 1 and i < cnt:\n try:\n fail = 0\n cur.execute(sql)\n except BaseException as error:\n fail = 1\n print(error)\n res = []\n if fail == 0:\n res = cur.fetchall()\n i = i + 1\n if fail == 1:\n # print(\"SQL Execution Fatal!!\", sql)\n return 0, ''\n elif fail == 0:\n return 1, res\n\n\ndef all_sql_files():\n res_path = \"{}/tpch-queries/\".format(\n os.path.dirname(os.path.abspath(__file__)))\n # all_file_list = list(filter(file_filter, os.listdir(res_path)))\n # all_file_list = sorted(all_file_list, key=custom_sort)\n all_file_list = [\n '4.explain.sql']\n\n print(all_file_list)\n files_list = []\n for file in all_file_list:\n files_list.append(res_path + file)\n return files_list\n\n\ndef custom_sort(item):\n # \u63d0\u53d6\u6570\u5b57\u548c\u5b57\u6bcd\u90e8\u5206\n match = re.match(r'(\\d+)(\\D+)', item)\n # \u5c06\u6570\u5b57\u90e8\u5206\u8f6c\u6362\u4e3a\u6574\u6570\u4ee5\u8fdb\u884c\u6bd4\u8f83\n num_part = int(match.group(1))\n # \u8fd4\u56de\u5143\u7ec4\u4ee5\u6309\u6570\u5b57\u548c\u5b57\u6bcd\u6392\u5e8f\n return (num_part, match.group(2))\n\n\ndef file_filter(f):\n if f[-4:] == '.sql' and 'schema' not in f and 'fkindexes' not in f:\n return True\n else:\n return False\n\n\ndef get_sql_from_file(file_name):\n file = open(file_name)\n lines = file.readlines().copy()\n sql = ''\n for line in lines:\n sql += line\n sql = sql.replace('\n', ' ').replace(' ', ' ').replace(' ', ' ')\n file.close()\n return sql\n\n\ndef test_hint_from_file(sql_file):\n db = Database()\n sql = get_sql_from_file(sql_file)\n success, result_cont = db.execute_sql(sql)\n print(success, result_cont)\n\n\ndef test_all():\n sql_files = all_sql_files()\n\n for sql_file in list(sql_files):\n if sql_file:\n test_hint_from_file(sql_file)\n\n\ndef test_one():\n res_path = \"{}/tpch-queries/\".format(\n os.path.dirname(os.path.abspath(__file__)))\n test_hint_from_file(res_path + '1.explain.sql')\n\n\nif __name__ == '__main__':\n for i in range(0, REPEATCOUNT):\n TIMELOG.write(str(int(time.time()))+\";\")\n test_all()\n TIMELOG.write(str(int(time.time()))+\"\n\")\n TIMELOG.flush()\n\n TIMELOG.close()\n", + "description": "In an e-commerce platform, when retrieving a large amount of data that 
requires correlated subqueries, such as finding the inventory levels for each product, there could be a performance degradation if the subqueries are not optimized. This can be simulated using the script \"python anomaly_trigger/main.py --anomaly FETCH_LARGE_DATA,CORRELATED_SUBQUERY\".\n", + "desc": "In an e-commerce platform, there is a database called 'ECommerceDB' that stores information about a variety of products. One table in this database, named 'ProductInventory', contains inventory information for different products. This table includes the product ID, current stock level, last inventory update time, supplier ID, warehouse location, and other details. In order to query the inventory level of products, related subqueries are performed. For example, a common query might be to determine the total current inventory of all products within a specific category. This type of query involves selecting products of a certain category from the 'ProductDetails' table and then conducting subqueries on the 'ProductInventory' table to obtain inventory data for these products. However, when there are a large number of products, the performance of these related subqueries can be inefficient. This is because executing individual subqueries for each product to obtain inventory information would take a lot of time. As a result, the database might need to read a significant amount of data from the disk, which can lead to I/O bottlenecks.\n" + }, + "440": { + "start_time": "1697361093", + "end_time": "1697361165", + "start_timestamp": "2023-10-15 17:11:33", + "end_timestamp": "2023-10-15 17:12:45", + "alerts": [], + "labels": [ + "highly concurrent commits or highly concurrent inserts" + ], + "command": "python anomaly_trigger/main.py --anomaly INSERT_LARGE_DATA", + "script": "import psycopg2\nimport sys\nsys.path.append('/root/DB-GPT/')\nimport time\nimport datetime\nimport random\nimport yaml\nfrom multiprocessing.pool import *\n\nclass DBArgs(object):\n\n def __init__(self, dbtype, config, dbname=None):\n self.dbtype = dbtype\n if self.dbtype == 'mysql':\n self.host = config['host']\n self.port = config['port']\n self.user = config['user']\n self.password = config['password']\n self.dbname = dbname if dbname else config['dbname']\n self.driver = 'com.mysql.jdbc.Driver'\n self.jdbc = 'jdbc:mysql://'\n else:\n self.host = config['host']\n self.port = config['port']\n self.user = config['user']\n self.password = config['password']\n self.dbname = dbname if dbname else config['dbname']\n self.driver = 'org.postgresql.Driver'\n self.jdbc = 'jdbc:postgresql://'\n\nclass Database():\n def __init__(self, args, timeout=-1):\n self.args = args\n self.conn = self.resetConn(timeout)\n\n\n # self.schema = self.compute_table_schema()\n\n def resetConn(self, timeout=-1):\n conn = psycopg2.connect(database=self.args.dbname,\n user=self.args.user,\n password=self.args.password,\n host=self.args.host,\n port=self.args.port)\n return conn\n\n def execute_sqls(self,sql):\n self.conn =self.resetConn(timeout=-1)\n cur = self.conn.cursor()\n cur.execute(sql)\n self.conn.commit()\n cur.close()\n self.conn.close()\n\n def execute_sql_duration(self, duration, sql, max_id=0, commit_interval=500):\n self.conn = self.resetConn(timeout=-1)\n cursor = self.conn.cursor()\n start = time.time()\n cnt = 0\n if duration > 0:\n while (time.time() - start) < duration:\n if max_id > 0:\n id = random.randint(1, max_id - 1)\n cursor.execute(sql + str(id) + ';')\n else:\n cursor.execute(sql)\n cnt += 1\n if cnt % commit_interval == 0:\n 
self.conn.commit()\n else:\n print(\"error, the duration should be larger than 0\")\n self.conn.commit()\n cursor.close()\n self.conn.close()\n return cnt\n\n def concurrent_execute_sql(self, threads, duration, sql, max_id=0, commit_interval=500):\n pool = ThreadPool(threads)\n results = [pool.apply_async(self.execute_sql_duration, (duration, sql, max_id, commit_interval)) for _ in range(threads)]\n pool.close()\n pool.join()\n return results\n \ndef init():\n #add the config\n config_path = \"/root/DB-GPT/config/tool_config.yaml\"\n with open(config_path, 'r') as config_file:\n config = yaml.safe_load(config_file) \n db_args =DBArgs('pgsql', config)\n return db_args\n\n#create a table\ndef create_table(table_name,colsize, ncolumns):\n db=Database(init())\n column_definitions = ', '.join(f'name{i} varchar({colsize})' for i in range(ncolumns))\n creat_sql = f'CREATE TABLE {table_name} (id int, {column_definitions}, time timestamp);'\n db.execute_sqls(creat_sql)\n\n#delete the table\ndef delete_table(table_name):\n db=Database(init())\n delete_sql=f'DROP TABLE if exists {table_name}'\n db.execute_sqls(delete_sql)\n\n#print the current time\ndef print_time():\n current_time = datetime.datetime.now()\n formatted_time = current_time.strftime(\"%Y-%m-%d %H:%M:%S\")\n print(formatted_time)\n\n\ndef insert_large_data(threads,duration,ncolumns,nrows,colsize,table_name='table1'):\n\n print_time()\n #Delete undeleted tables\n delete_table(table_name)\n #create a new table\n create_table(table_name,colsize, ncolumns)\n db=Database(init())\n #insert the data\n #insert_definitions = ', '.join(f'repeat(round(random()*999)::text,{(colsize//3)})' for i in range(ncolumns))\n insert_definitions = ', '.join(f'(SELECT substr(md5(random()::text), 1, {colsize}))' for i in range(ncolumns))\n insert_data=f'insert into {table_name} select generate_series(1,{nrows}),{insert_definitions}, now();'\n db.concurrent_execute_sql(threads,duration,insert_data,commit_interval=1)\n\n #delete the table\n delete_table(table_name)\n \n #print the end time\n print_time()\nif __name__ == \"__main__\":\n # Number of threads to use for concurrent inserts\n num_threads = 86\n \n # Duration for which to run the inserts (in seconds)\n insert_duration = None\n \n # Number of columns in the table\n num_columns = 15\n \n # Number of rows to insert\n num_rows = 60\n \n # Size of each column (in characters)\n column_size = 22\n \n # Table name\n table_name = 'table1'\n \n # Call the insert_large_data function\n insert_large_data(num_threads, insert_duration, num_columns, num_rows, column_size, table_name)\n", + "description": "In a data-intensive application where 86 concurrent users are inserting a large amount of data into a database table with 15 columns and 60 rows, each column having a size of 22 characters, simulate the database exception caused by this process.\n", + "desc": "In an Internet of Things (IoT) scenario, there is a database called 'IoTDataDB' used for collecting and analyzing data from various sensors. This database is designed to handle a large volume of sensor data and contains a key table named 'SensorReadings'. This table stores data from 60 sensors, with each row representing a reading from a specific sensor. The table has 15 columns, including sensor ID, reading category, value, timestamp, location, and status. The size of each column is limited to 22 characters. The sensors continuously transmit data to the database, and at a specific moment, 86 sensors start transmitting data simultaneously. 
Due to the high volume of data being inserted into the 'SensorReadings' table, the database might encounter performance issues. This can be caused by factors such as insufficient buffering mechanisms, lack of appropriate partitioning, or absence of necessary indexes. As a result, the database's ability to process these large-scale insertions efficiently could be impacted, leading to anomalies in the database. These anomalies may result in increased write latency, delayed data processing, or even database locking, affecting the overall functionality and performance of the IoT system.\n" + }, + "441": { + "start_time": "1697361225", + "end_time": "1697361296", + "start_timestamp": "2023-10-15 17:13:45", + "end_timestamp": "2023-10-15 17:14:56", + "alerts": [], + "labels": [ + "highly concurrent commits or highly concurrent inserts" + ], + "command": "python anomaly_trigger/main.py --anomaly INSERT_LARGE_DATA", + "script": "import psycopg2\nimport sys\nsys.path.append('/root/DB-GPT/')\nimport time\nimport datetime\nimport random\nimport yaml\nfrom multiprocessing.pool import *\n\nclass DBArgs(object):\n\n def __init__(self, dbtype, config, dbname=None):\n self.dbtype = dbtype\n if self.dbtype == 'mysql':\n self.host = config['host']\n self.port = config['port']\n self.user = config['user']\n self.password = config['password']\n self.dbname = dbname if dbname else config['dbname']\n self.driver = 'com.mysql.jdbc.Driver'\n self.jdbc = 'jdbc:mysql://'\n else:\n self.host = config['host']\n self.port = config['port']\n self.user = config['user']\n self.password = config['password']\n self.dbname = dbname if dbname else config['dbname']\n self.driver = 'org.postgresql.Driver'\n self.jdbc = 'jdbc:postgresql://'\n\nclass Database():\n def __init__(self, args, timeout=-1):\n self.args = args\n self.conn = self.resetConn(timeout)\n\n\n # self.schema = self.compute_table_schema()\n\n def resetConn(self, timeout=-1):\n conn = psycopg2.connect(database=self.args.dbname,\n user=self.args.user,\n password=self.args.password,\n host=self.args.host,\n port=self.args.port)\n return conn\n\n def execute_sqls(self,sql):\n self.conn =self.resetConn(timeout=-1)\n cur = self.conn.cursor()\n cur.execute(sql)\n self.conn.commit()\n cur.close()\n self.conn.close()\n\n def execute_sql_duration(self, duration, sql, max_id=0, commit_interval=500):\n self.conn = self.resetConn(timeout=-1)\n cursor = self.conn.cursor()\n start = time.time()\n cnt = 0\n if duration > 0:\n while (time.time() - start) < duration:\n if max_id > 0:\n id = random.randint(1, max_id - 1)\n cursor.execute(sql + str(id) + ';')\n else:\n cursor.execute(sql)\n cnt += 1\n if cnt % commit_interval == 0:\n self.conn.commit()\n else:\n print(\"error, the duration should be larger than 0\")\n self.conn.commit()\n cursor.close()\n self.conn.close()\n return cnt\n\n def concurrent_execute_sql(self, threads, duration, sql, max_id=0, commit_interval=500):\n pool = ThreadPool(threads)\n results = [pool.apply_async(self.execute_sql_duration, (duration, sql, max_id, commit_interval)) for _ in range(threads)]\n pool.close()\n pool.join()\n return results\n \ndef init():\n #add the config\n config_path = \"/root/DB-GPT/config/tool_config.yaml\"\n with open(config_path, 'r') as config_file:\n config = yaml.safe_load(config_file) \n db_args =DBArgs('pgsql', config)\n return db_args\n\n#create a table\ndef create_table(table_name,colsize, ncolumns):\n db=Database(init())\n column_definitions = ', '.join(f'name{i} varchar({colsize})' for i in range(ncolumns))\n 
creat_sql = f'CREATE TABLE {table_name} (id int, {column_definitions}, time timestamp);'\n db.execute_sqls(creat_sql)\n\n#delete the table\ndef delete_table(table_name):\n db=Database(init())\n delete_sql=f'DROP TABLE if exists {table_name}'\n db.execute_sqls(delete_sql)\n\n#print the current time\ndef print_time():\n current_time = datetime.datetime.now()\n formatted_time = current_time.strftime(\"%Y-%m-%d %H:%M:%S\")\n print(formatted_time)\n\n\ndef insert_large_data(threads,duration,ncolumns,nrows,colsize,table_name='table1'):\n\n print_time()\n #Delete undeleted tables\n delete_table(table_name)\n #create a new table\n create_table(table_name,colsize, ncolumns)\n db=Database(init())\n #insert the data\n #insert_definitions = ', '.join(f'repeat(round(random()*999)::text,{(colsize//3)})' for i in range(ncolumns))\n insert_definitions = ', '.join(f'(SELECT substr(md5(random()::text), 1, {colsize}))' for i in range(ncolumns))\n insert_data=f'insert into {table_name} select generate_series(1,{nrows}),{insert_definitions}, now();'\n db.concurrent_execute_sql(threads,duration,insert_data,commit_interval=1)\n\n #delete the table\n delete_table(table_name)\n \n #print the end time\n print_time()\nif __name__ == \"__main__\":\n # Number of threads to use for concurrent inserts\n num_threads = 86\n \n # Duration for which to run the inserts (in seconds)\n insert_duration = None\n \n # Number of columns in the table\n num_columns = 35\n \n # Number of rows to insert\n num_rows = 92\n \n # Size of each column (in characters)\n column_size = 54\n \n # Table name\n table_name = 'table1'\n \n # Call the insert_large_data function\n insert_large_data(num_threads, insert_duration, num_columns, num_rows, column_size, table_name)\n", + "description": "In a data collection system, 86 sensors are generating a large amount of data simultaneously. This data needs to be inserted into the database. Each data entry contains 35 columns, with each column having a maximum size of 54 characters. There are a total of 92 entries to be inserted. This process might simulate a database exception caused by the high volume of data being inserted at once.\n", + "desc": "In a business intelligence scenario, there is a database named 'BusinessAnalyticsDB' that stores and analyzes various business data. Within this database, there is a key table called 'BusinessData' that records detailed information about business transactions and activities. This table contains 92 rows of data, each representing a specific business record, with a total of 35 columns, each containing information up to 54 characters. These columns may include transaction ID, transaction type, transaction amount, date and time, customer information, product information, sales region, and other relevant details.Suppose, in a specific situation, 86 users simultaneously attempt to insert a large amount of new data into the 'BusinessData' table. This could be due to various factors such as a new marketing campaign, acquisitions, or bulk data imports. However, if the database is not effectively optimized to handle such high-volume insertions, it could lead to performance issues. The lack of proper indexing, buffering mechanisms, or partitioning strategies might cause slower insertion speeds and database contention. As a result, the overall performance of the database could be compromised, affecting other concurrent operations and potentially leading to anomalies. 
It is important for the database administrator to optimize the database structure, implement efficient indexing techniques, and consider batch insertions or parallel processing to ensure smooth and efficient data insertion operations.\n" + }, + "442": { + "start_time": "1697361356", + "end_time": "1697361416", + "start_timestamp": "2023-10-15 17:15:56", + "end_timestamp": "2023-10-15 17:16:56", + "alerts": [], + "labels": [ + "highly concurrent updates" + ], + "command": "python anomaly_trigger/main.py --anomaly LOCK_CONTENTION", + "script": "import psycopg2\nimport sys\nsys.path.append('/root/DB-GPT/')\nimport time\nimport datetime\nimport random\nimport yaml\nfrom multiprocessing.pool import *\n\n\nclass DBArgs(object):\n\n def __init__(self, dbtype, config, dbname=None):\n self.dbtype = dbtype\n if self.dbtype == 'mysql':\n self.host = config['host']\n self.port = config['port']\n self.user = config['user']\n self.password = config['password']\n self.dbname = dbname if dbname else config['dbname']\n self.driver = 'com.mysql.jdbc.Driver'\n self.jdbc = 'jdbc:mysql://'\n else:\n self.host = config['host']\n self.port = config['port']\n self.user = config['user']\n self.password = config['password']\n self.dbname = dbname if dbname else config['dbname']\n self.driver = 'org.postgresql.Driver'\n self.jdbc = 'jdbc:postgresql://'\n\nclass Database():\n def __init__(self, args, timeout=-1):\n self.args = args\n self.conn = self.resetConn(timeout)\n\n\n # self.schema = self.compute_table_schema()\n\n def resetConn(self, timeout=-1):\n conn = psycopg2.connect(database=self.args.dbname,\n user=self.args.user,\n password=self.args.password,\n host=self.args.host,\n port=self.args.port)\n return conn\n \n def execute_sqls(self,sql):\n self.conn =self.resetConn(timeout=-1)\n cur = self.conn.cursor()\n cur.execute(sql)\n self.conn.commit()\n cur.close()\n self.conn.close()\n\n def execute_sql_duration(self, duration, sql, max_id=0, commit_interval=500):\n self.conn = self.resetConn(timeout=-1)\n cursor = self.conn.cursor()\n start = time.time()\n cnt = 0\n if duration > 0:\n while (time.time() - start) < duration:\n if max_id > 0:\n id = random.randint(1, max_id - 1)\n cursor.execute(sql + str(id) + ';')\n else:\n cursor.execute(sql)\n cnt += 1\n if cnt % commit_interval == 0:\n self.conn.commit()\n else:\n print(\"error, the duration should be larger than 0\")\n self.conn.commit()\n cursor.close()\n self.conn.close()\n return cnt\n\n def concurrent_execute_sql(self, threads, duration, sql, max_id=0, commit_interval=500):\n pool = ThreadPool(threads)\n results = [pool.apply_async(self.execute_sql_duration, (duration, sql, max_id, commit_interval)) for _ in range(threads)]\n pool.close()\n pool.join()\n return results\n\ndef init():\n #add the config\n config_path = \"/root/DB-GPT/config/tool_config.yaml\"\n with open(config_path, 'r') as config_file:\n config = yaml.safe_load(config_file) \n db_args =DBArgs('pgsql', config)\n return db_args\n\n\n#create a table\ndef create_table(table_name,colsize, ncolumns):\n db=Database(init())\n column_definitions = ', '.join(f'name{i} varchar({colsize})' for i in range(ncolumns))\n creat_sql = f'CREATE TABLE {table_name} (id int, {column_definitions}, time timestamp);'\n db.execute_sqls(creat_sql)\n\n#delete the table\ndef delete_table(table_name):\n db=Database(init())\n delete_sql=f'DROP TABLE if exists {table_name}'\n db.execute_sqls(delete_sql)\n\n#print the current time\ndef print_time():\n current_time = datetime.datetime.now()\n formatted_time = 
current_time.strftime(\"%Y-%m-%d %H:%M:%S\")\n print(formatted_time)\n\n\ndef lock_contention(threads,duration,ncolumns,nrows,colsize,table_name='table1'):\n #create a new table\n print_time()\n delete_table(table_name)\n create_table(table_name,colsize, ncolumns)\n db=Database(init())\n # insert some data to be updated \n insert_definitions = ', '.join(f'(SELECT substr(md5(random()::text), 1, {colsize}))' for i in range(ncolumns))\n insert_data=f'insert into {table_name} select generate_series(1,{nrows}),{insert_definitions}, now();' \n db.execute_sqls(insert_data) \n pool = Pool(threads)\n for _ in range(threads):\n pool.apply_async(\n lock, (table_name, ncolumns, colsize, duration, nrows))\n pool.close()\n pool.join()\n #delete the table\n delete_table(table_name)\n print_time()\n\ndef lock(table_name, ncolumns, colsize, duration, nrows):\n args=init()\n start = time.time()\n #lock_contention\n while time.time()-start < duration:\n conn = psycopg2.connect(database=args.dbname, user=args.user, password=args.password,\n host=args.host, port=args.port)\n cur = conn.cursor()\n while time.time()-start < duration:\n col_name = random.randint(0, ncolumns-1)\n row_name = random.randint(1, nrows-1)\n lock_contention = f'update {table_name} set name{col_name}=(SELECT substr(md5(random()::text), 1, {colsize})) where id ={row_name}'\n #db.concurrent_execute_sql(threads,duration,lock_contention,nrows)\n cur.execute(lock_contention)\n conn.commit()\n conn.commit()\n conn.close()\n\nif __name__ == \"__main__\":\n # Number of threads to use for concurrent inserts\n num_threads = 107\n \n # Duration for which to run the inserts (in seconds)\n insert_duration = None\n \n # Number of columns in the table\n num_columns = 92\n \n # Number of rows to insert\n num_rows = 325\n \n # Size of each column (in characters)\n column_size = 66\n \n # Table name\n table_name = 'table1'\n \n # Call the insert_large_data function\n lock_contention(num_threads, insert_duration, num_columns, num_rows, column_size, table_name)\n", + "description": "In a database system for an online platform, 107 users simultaneously attempt to perform a frequent update operation in a database table containing 92 columns and 325 rows of data, with each column having a size of 66 characters. These users compete with each other to lock the database table and perform the update operation, potentially causing a database exception.\n", + "desc": "In the scenario of a banking database, there is a database called 'BankingDB' that stores customer and transaction data for a bank. Within this database, there is a primary table called 'AccountTransactions' that records detailed information about various banking transactions. This table contains 325 rows of data, with each row representing a transaction record for an account. The table consists of 92 columns, each containing information of up to 66 characters. These columns may include transaction ID, account number, transaction type (such as deposit, withdrawal, transfer), transaction amount, transaction date and time, counterparty account information, transaction status, employee ID, transaction location, currency type, and more.In this specific situation, 107 users simultaneously attempt frequent update operations on the 'AccountTransactions' table. These operations involve updating transaction statuses, modifying transaction amounts, or adding transaction notes. 
Due to the large number of users and the concurrent nature of their operations, there might be a contention for locking the database table. This contention could lead to performance issues in the database. During peak hours, if such locking persists, it could result in delayed processing or failure of other users' transaction requests, thereby impacting the daily operations of the bank. Additionally, if such incidents occur frequently, they could also cause the database transaction log to grow rapidly, consuming excessive storage space, and might even cause temporary interruptions in database services.\n" + }, + "443": { + "start_time": "1697361476", + "end_time": "1697361539", + "start_timestamp": "2023-10-15 17:17:56", + "end_timestamp": "2023-10-15 17:18:59", + "alerts": [], + "labels": [ + "highly deletes" + ], + "command": "python anomaly_trigger/main.py --anomaly VACUUM", + "script": "import psycopg2\nimport sys\nsys.path.append('/root/DB-GPT/')\nimport time\nimport datetime\nimport random\nimport yaml\nfrom multiprocessing.pool import *\n\n\nclass DBArgs(object):\n\n def __init__(self, dbtype, config, dbname=None):\n self.dbtype = dbtype\n if self.dbtype == 'mysql':\n self.host = config['host']\n self.port = config['port']\n self.user = config['user']\n self.password = config['password']\n self.dbname = dbname if dbname else config['dbname']\n self.driver = 'com.mysql.jdbc.Driver'\n self.jdbc = 'jdbc:mysql://'\n else:\n self.host = config['host']\n self.port = config['port']\n self.user = config['user']\n self.password = config['password']\n self.dbname = dbname if dbname else config['dbname']\n self.driver = 'org.postgresql.Driver'\n self.jdbc = 'jdbc:postgresql://'\n\nclass Database():\n def __init__(self, args, timeout=-1):\n self.args = args\n self.conn = self.resetConn(timeout)\n\n\n # self.schema = self.compute_table_schema()\n\n def resetConn(self, timeout=-1):\n conn = psycopg2.connect(database=self.args.dbname,\n user=self.args.user,\n password=self.args.password,\n host=self.args.host,\n port=self.args.port)\n return conn\n \n def execute_sqls(self,sql):\n self.conn =self.resetConn(timeout=-1)\n cur = self.conn.cursor()\n cur.execute(sql)\n self.conn.commit()\n cur.close()\n self.conn.close()\n\n def execute_sql_duration(self, duration, sql, max_id=0, commit_interval=500):\n self.conn = self.resetConn(timeout=-1)\n cursor = self.conn.cursor()\n start = time.time()\n cnt = 0\n if duration > 0:\n while (time.time() - start) < duration:\n if max_id > 0:\n id = random.randint(1, max_id - 1)\n cursor.execute(sql + str(id) + ';')\n else:\n cursor.execute(sql)\n cnt += 1\n if cnt % commit_interval == 0:\n self.conn.commit()\n else:\n print(\"error, the duration should be larger than 0\")\n self.conn.commit()\n cursor.close()\n self.conn.close()\n return cnt\n\n def concurrent_execute_sql(self, threads, duration, sql, max_id=0, commit_interval=500):\n pool = ThreadPool(threads)\n results = [pool.apply_async(self.execute_sql_duration, (duration, sql, max_id, commit_interval)) for _ in range(threads)]\n pool.close()\n pool.join()\n return results\n \ndef init():\n #add the config\n config_path = \"/root/DB-GPT/config/tool_config.yaml\"\n with open(config_path, 'r') as config_file:\n config = yaml.safe_load(config_file) \n db_args =DBArgs('pgsql', config)\n return db_args\n\n\n#create a table\ndef create_table(table_name,colsize, ncolumns):\n db=Database(init())\n column_definitions = ', '.join(f'name{i} varchar({colsize})' for i in range(ncolumns))\n creat_sql = f'CREATE TABLE 
{table_name} (id int, {column_definitions}, time timestamp);'\n db.execute_sqls(creat_sql)\n\n#delete the table\ndef delete_table(table_name):\n db=Database(init())\n delete_sql=f'DROP TABLE if exists {table_name}'\n db.execute_sqls(delete_sql)\n\n#print the current time\ndef print_time():\n current_time = datetime.datetime.now()\n formatted_time = current_time.strftime(\"%Y-%m-%d %H:%M:%S\")\n print(formatted_time)\n\n\ndef vacuum(threads,duration,ncolumns,nrows,colsize,table_name='table1'):\n db=Database(init())\n #create a new table\n print_time()\n delete_table(table_name)\n create_table(table_name,colsize, ncolumns)\n\n # insert some data to be deleted\n insert_definitions = ', '.join(f'(SELECT substr(md5(random()::text), 1, {colsize}))' for i in range(ncolumns))\n insert_data=f'insert into {table_name} select generate_series(1,{nrows}),{insert_definitions}, now();' \n db.execute_sqls(insert_data) \n\n # delete 80% of the rows\n delete_nrows=int(nrows*0.8)\n vacuum=f'delete from {table_name} where id < {delete_nrows};'\n db.execute_sqls(vacuum)\n\n # do the select , then the vacuum occurs\n select='select * from '+table_name+' where id='\n db.concurrent_execute_sql(threads,duration,select,nrows)\n\n #delete the table\n delete_table(table_name)\n print_time()\n\n\nif __name__ == \"__main__\":\n # Number of threads to use for concurrent inserts\n num_threads = 135\n \n # Duration for which to run the inserts (in seconds)\n insert_duration = None\n \n # Number of columns in the table\n num_columns = 15\n \n # Number of rows to insert\n num_rows = 2954088\n \n # Size of each column (in characters)\n column_size = 54\n \n # Table name\n table_name = 'table1'\n \n # Call the insert_large_data function\n vacuum(num_threads, insert_duration, num_columns, num_rows, column_size, table_name)\n", + "description": "In an online marketplace, 135 users simultaneously perform a search operation after a large-scale data cleaning process on a database table containing 15 columns and 2,954,088 rows of product records. Each column has a size of 54 characters. This may result in an exception in the database due to the increased workload and cleaning process.\n", + "desc": "In a real-life scenario, let's consider an e-commerce platform that has a database called 'OnlineStoreDB' for storing and managing product information. Within this database, there is a table named 'ProductRecords', which contains data for a large number of products. Each row in this table represents a unique product entry, and there are a total of 2,954,088 rows. The 'ProductRecords' table has 15 columns, each with a maximum size of 54 characters. These columns store various details about the products, such as product ID, name, price, quantity, description, brand, category, size, color, weight, production and expiration dates, supplier information, country, and rating.In this specific case, the anomaly being triggered is related to the vacuum operation. The database administrator needs to perform a vacuum operation as part of routine maintenance tasks or to optimize the performance of the database. Vacuuming involves cleaning up the unused or redundant space in the database and reclaiming it for future use. This process can help improve overall database performance and storage efficiency.However, the provided command specifies certain parameters, such as running the vacuum operation with 135 threads, which indicates a high level of parallelism. 
Additionally, the data size is significant, with millions of rows and 15 columns, each with a sizable size. This means that the vacuum operation is being performed on a large-scale database with a considerable amount of data.Performing a vacuum operation on such a large and complex database can have various impacts and potential challenges. It might require significant server resources, such as CPU and memory, to efficiently process the operation with the specified parallelism. Additionally, if the vacuum operation is not carefully managed, it could cause contention for system resources, affecting the performance of other database operations. Therefore, it is crucial to ensure that the vacuum operation is executed during periods of low database usage or to have appropriate strategies for resource allocation and workload management in place.\n" + }, + "444": { + "start_time": "1697361599", + "end_time": "1697361713", + "start_timestamp": "2023-10-15 17:19:59", + "end_timestamp": "2023-10-15 17:21:53", + "alerts": [], + "labels": [ + "too many indexes" + ], + "command": "python anomaly_trigger/main.py --anomaly REDUNDANT_INDEX", + "script": "import psycopg2\nimport sys\nsys.path.append('/root/DB-GPT/')\nimport time\nimport datetime\nimport random\nimport yaml\nfrom multiprocessing.pool import *\n\n\nclass DBArgs(object):\n\n def __init__(self, dbtype, config, dbname=None):\n self.dbtype = dbtype\n if self.dbtype == 'mysql':\n self.host = config['host']\n self.port = config['port']\n self.user = config['user']\n self.password = config['password']\n self.dbname = dbname if dbname else config['dbname']\n self.driver = 'com.mysql.jdbc.Driver'\n self.jdbc = 'jdbc:mysql://'\n else:\n self.host = config['host']\n self.port = config['port']\n self.user = config['user']\n self.password = config['password']\n self.dbname = dbname if dbname else config['dbname']\n self.driver = 'org.postgresql.Driver'\n self.jdbc = 'jdbc:postgresql://'\n\nclass Database():\n def __init__(self, args, timeout=-1):\n self.args = args\n self.conn = self.resetConn(timeout)\n\n\n # self.schema = self.compute_table_schema()\n\n def resetConn(self, timeout=-1):\n conn = psycopg2.connect(database=self.args.dbname,\n user=self.args.user,\n password=self.args.password,\n host=self.args.host,\n port=self.args.port)\n return conn\n \n def execute_sqls(self,sql):\n self.conn =self.resetConn(timeout=-1)\n cur = self.conn.cursor()\n cur.execute(sql)\n self.conn.commit()\n cur.close()\n self.conn.close()\n\n def execute_sql_duration(self, duration, sql, max_id=0, commit_interval=500):\n self.conn = self.resetConn(timeout=-1)\n cursor = self.conn.cursor()\n start = time.time()\n cnt = 0\n if duration > 0:\n while (time.time() - start) < duration:\n if max_id > 0:\n id = random.randint(1, max_id - 1)\n cursor.execute(sql + str(id) + ';')\n else:\n cursor.execute(sql)\n cnt += 1\n if cnt % commit_interval == 0:\n self.conn.commit()\n else:\n print(\"error, the duration should be larger than 0\")\n self.conn.commit()\n cursor.close()\n self.conn.close()\n return cnt\n\n def concurrent_execute_sql(self, threads, duration, sql, max_id=0, commit_interval=500):\n pool = ThreadPool(threads)\n results = [pool.apply_async(self.execute_sql_duration, (duration, sql, max_id, commit_interval)) for _ in range(threads)]\n pool.close()\n pool.join()\n return results\n \n def build_index(self, table_name, idx_num):\n self.conn = self.resetConn(timeout=-1)\n cursor = self.conn.cursor()\n \n for i in range(0, idx_num):\n the_sql = 'CREATE INDEX index_' + 
table_name + '_' + str(i) + ' ON ' + table_name + '(name' + str(i) + ');'\n print(the_sql)\n cursor.execute(the_sql)\n\n \n self.conn.commit()\n self.conn.close()\n return\n\n\n \n def drop_index(self,table_name):\n self.conn = self.resetConn(timeout=-1)\n cursor = self.conn.cursor()\n cursor.execute(\"select indexname from pg_indexes where tablename='\"+table_name+\"';\")\n idxs = cursor.fetchall()\n for idx in idxs:\n the_sql = 'DROP INDEX ' + idx[0] + ';'\n cursor.execute(the_sql)\n print(the_sql)\n self.conn.commit()\n self.conn.close()\n return\n\n\ndef init():\n #add the config\n config_path = \"/root/DB-GPT/config/tool_config.yaml\"\n with open(config_path, 'r') as config_file:\n config = yaml.safe_load(config_file) \n db_args =DBArgs('pgsql', config)\n return db_args\n\n\n#create a table\ndef create_table(table_name,colsize, ncolumns):\n db=Database(init())\n column_definitions = ', '.join(f'name{i} varchar({colsize})' for i in range(ncolumns))\n creat_sql = f'CREATE TABLE {table_name} (id int, {column_definitions}, time timestamp);'\n db.execute_sqls(creat_sql)\n\n#delete the table\ndef delete_table(table_name):\n db=Database(init())\n delete_sql=f'DROP TABLE if exists {table_name}'\n db.execute_sqls(delete_sql)\n\n#print the current time\ndef print_time():\n current_time = datetime.datetime.now()\n formatted_time = current_time.strftime(\"%Y-%m-%d %H:%M:%S\")\n print(formatted_time)\n\ndef redundent_index(threads,duration,ncolumns,nrows,colsize,nindex,table_name='table1'):\n #create a new table\n print_time()\n delete_table(table_name)\n create_table(table_name,colsize, ncolumns)\n db=Database(init())\n # insert some data to be updated \n insert_definitions = ', '.join(f'(SELECT substr(md5(random()::text), 1, {colsize}))' for i in range(ncolumns))\n insert_data=f'insert into {table_name} select generate_series(1,{nrows}),{insert_definitions}, now();' \n db.execute_sqls(insert_data) \n\n #initialization of the indexes\n nindex=int((nindex*ncolumns)/10)\n db.build_index(table_name,nindex)\n id_index='CREATE INDEX index_'+table_name+'_id ON '+table_name+'(id);'\n db.execute_sqls(id_index)\n\n #lock_contention\n pool = Pool(threads)\n for _ in range(threads):\n pool.apply_async(\n lock, (table_name, ncolumns, colsize, duration, nrows))\n pool.close()\n pool.join()\n\n #drop the index\n db.drop_index(table_name)\n\n #delete the table\n delete_table(table_name)\n print_time()\n\n\ndef lock(table_name, ncolumns, colsize, duration, nrows):\n args=init()\n start = time.time()\n #lock_contention\n while time.time()-start < duration:\n conn = psycopg2.connect(database=args.dbname, user=args.user, password=args.password,\n host=args.host, port=args.port)\n cur = conn.cursor()\n while time.time()-start < duration:\n col_name = random.randint(0, ncolumns-1)\n row_name = random.randint(1, nrows-1)\n lock_contention = f'update {table_name} set name{col_name}=(SELECT substr(md5(random()::text), 1, {colsize})) where id ={row_name}'\n #db.concurrent_execute_sql(threads,duration,lock_contention,nrows)\n cur.execute(lock_contention)\n conn.commit()\n conn.commit()\n conn.close()\n\nif __name__ == \"__main__\":\n # Number of threads to use for concurrent inserts\n num_threads = 8\n \n # Duration for which to run the inserts (in seconds)\n insert_duration = None\n \n # Number of columns in the table\n num_columns = 69\n \n # Number of rows to insert\n num_rows = 659977\n \n # Size of each column (in characters)\n column_size = 63\n \n # Table name\n table_name = 'table1'\n \n nindex=6\n \n # Call the 
insert_large_data function\n redundent_index(num_threads, insert_duration, num_columns, num_rows, column_size, nindex,table_name)\n", + "description": "In a database system for an online marketplace, multiple users are performing queries on a table with 69 columns and 659,977 rows of data. Each column has a size of 63 characters. However, there are redundant indexes created for attributes such as product name, category, and price range. These indexes have an additional storage footprint and cause performance overhead during the query process.\n", + "desc": "In a business intelligence scenario, imagine a database named 'BusinessDataDB', which is used to store and analyze various types of business data. This database includes a key table named 'BusinessRecords', which contains detailed information about different business transactions. This table consists of 659,977 rows of data, each representing a business transaction record, with a total of 69 columns, each containing information of up to 63 characters. These columns may include transaction ID, transaction type, amount, date, customer information, product information, salesperson ID, department, location, and more.In this scenario, the database administrator wants to improve the performance of complex queries performed by business analysts. To achieve this, the administrator decides to create redundant indexes on specific columns used in frequently executed queries. These indexes could include columns such as transaction type, customer information, or product information.By creating redundant indexes, the administrator aims to accelerate the execution of these complex queries, allowing analysts to obtain business insights more efficiently. However, it is important to note that this approach might increase storage usage and overhead in the database, as well as potentially cause database fragmentation. Therefore, proper monitoring and optimization measures need to be implemented to ensure the overall performance and stability of the business intelligence system.\n" + }, + "445": { + "start_time": "1697361774", + "end_time": "1697361864", + "start_timestamp": "2023-10-15 17:22:54", + "end_timestamp": "2023-10-15 17:24:24", + "alerts": [], + "labels": [ + "INSERT_LARGE_DATA", + "IO_CONTENTION" + ], + "command": "python anomaly_trigger/main.py --anomaly INSERT_LARGE_DATA,IO_CONTENTION", + "script": "import os\nimport datetime\n\n#print the current time\ndef print_time():\n current_time = datetime.datetime.now()\n formatted_time = current_time.strftime(\"%Y-%m-%d %H:%M:%S\")\n print(formatted_time)\n\nif __name__ == \"__main__\":\n print_time()\n command = (\n \"su - root -c 'cd /sysbench-tpcc-master; \"\n \"./tpcc.lua --db-driver=pgsql --tables=2 --scale=3 --threads=50 --events=0 \"\n \"--pgsql-host=xxxx --pgsql-user=xxxx --pgsql-password=xxxx \"\n \"--pgsql-port=5432 --pgsql-db=tpcc --time=90 --rand-type=uniform --report-interval=10 run'\"\n )\n\n os.system(command)\n print_time()\n", + "description": "In a file sharing system, multiple users are simultaneously uploading, downloading, or editing files, causing a competition for input/output operations. This results in slower file transfers.\n", + "desc": "In a file sharing system, such as 'TeamFileShareDB' used by teams or organizations, multiple users are simultaneously uploading, downloading, and editing files. 
This system not only stores the actual files but also records metadata about them, such as uploader information, file size, creation and modification dates, version history, access permissions, and download counts. This high concurrency in file operations can result in I/O (input/output) contention. When multiple users upload or download large files at the same time, it may strain the system's storage and network bandwidth. This contention can slow down file transfer speeds, especially in situations where there is limited bandwidth or insufficient server processing power. Additionally, frequent write operations in the database, such as file uploads and updates to metadata, can impact the overall performance of the database. During peak periods, the database might experience locking and transaction management issues, which further slow down file processing and recording of metadata.\n" + }, + "446": { + "start_time": "1697361924", + "end_time": "1697361985", + "start_timestamp": "2023-10-15 17:25:24", + "end_timestamp": "2023-10-15 17:26:25", + "alerts": [], + "labels": [ + "POOR JOIN PERFORMANCE", + "CPU CONTENTION" + ], + "command": "python anomaly_trigger/main.py --anomaly POOR_JOIN_PERFORMANCE,CPU_CONTENTION", + "script": "import os\nimport re\nimport time\n\nimport psycopg2\n\n\nREPEATCOUNT = 1\nTIMELOGPATH = str(int(time.time())) + \"_job_trigger_time_log.txt\"\nTIMELOG = open(TIMELOGPATH, 'w+')\n\n\nclass Database():\n\n def __init__(self):\n self.conn = None\n self.conn = psycopg2.connect(database='imdbload',\n user='xxxx',\n password='xxxx',\n host='xxxx',\n port=5432)\n\n def execute_sql(self, sql):\n fail = 1\n cur = self.conn.cursor()\n i = 0\n cnt = 3\n while fail == 1 and i < cnt:\n try:\n fail = 0\n cur.execute(sql)\n except BaseException as error:\n fail = 1\n print(error)\n res = []\n if fail == 0:\n res = cur.fetchall()\n i = i + 1\n if fail == 1:\n # print(\"SQL Execution Fatal!!\", sql)\n return 0, ''\n elif fail == 0:\n return 1, res\n\n\ndef all_sql_files():\n res_path = \"{}/join-order-benchmark-master/\".format(\n os.path.dirname(os.path.abspath(__file__)))\n # all_file_list = list(filter(file_filter, os.listdir(res_path)))\n # all_file_list = sorted(all_file_list, key=custom_sort)\n all_file_list = [\n '1a.sql', '1b.sql', '1c.sql', '1d.sql',\n '2a.sql', '2b.sql', '2c.sql', '2d.sql',\n '3a.sql', '3b.sql', '3c.sql',\n '4a.sql', '4b.sql', '4c.sql',\n '5a.sql', '5b.sql', '5c.sql',\n '6a.sql', '6b.sql', '6c.sql', '6d.sql', '6e.sql', '6f.sql',\n '7a.sql', '7b.sql', '7c.sql',\n '8a.sql', '8b.sql', '8c.sql', '8d.sql',\n '9a.sql', '9b.sql', '9c.sql', '9d.sql',\n '10a.sql', '10b.sql', '10c.sql',\n '11a.sql', '11b.sql', '11c.sql', '11d.sql',\n '12a.sql', '12b.sql', '12c.sql',\n '13a.sql', '13b.sql', '13c.sql', '13d.sql',\n '14a.sql', '14b.sql', '14c.sql',\n '15a.sql', '15b.sql', '15c.sql', '15d.sql',\n '16a.sql', '16b.sql', '16c.sql', '16d.sql',\n '17a.sql', '17b.sql', '17c.sql', '17d.sql', '17e.sql', '17f.sql',\n '18a.sql', '18b.sql', '18c.sql',\n '19a.sql', '19b.sql', '19c.sql', '19d.sql',\n '20a.sql', '20b.sql', '20c.sql',\n '21a.sql', '21b.sql', '21c.sql',\n '22a.sql', '22b.sql', '22c.sql', '22d.sql',\n '23a.sql', '23b.sql', '23c.sql',\n '24a.sql', '24b.sql',\n '25a.sql', '25b.sql', '25c.sql',\n '26a.sql', '26b.sql', '26c.sql',\n '27a.sql', '27b.sql', '27c.sql',\n '28a.sql', '28b.sql', '28c.sql',\n '29a.sql', '29b.sql', '29c.sql',\n '30a.sql', '30b.sql', '30c.sql',\n '31a.sql', '31b.sql', '31c.sql',\n '32a.sql', '32b.sql',\n '33a.sql', '33b.sql', '33c.sql']\n\n 
print(all_file_list)\n files_list = []\n for file in all_file_list:\n files_list.append(res_path + file)\n return files_list\n\n\ndef custom_sort(item):\n # \u63d0\u53d6\u6570\u5b57\u548c\u5b57\u6bcd\u90e8\u5206\n match = re.match(r'(\\d+)(\\D+)', item)\n # \u5c06\u6570\u5b57\u90e8\u5206\u8f6c\u6362\u4e3a\u6574\u6570\u4ee5\u8fdb\u884c\u6bd4\u8f83\n num_part = int(match.group(1))\n # \u8fd4\u56de\u5143\u7ec4\u4ee5\u6309\u6570\u5b57\u548c\u5b57\u6bcd\u6392\u5e8f\n return (num_part, match.group(2))\n\n\ndef file_filter(f):\n if f[-4:] == '.sql' and 'schema' not in f and 'fkindexes' not in f:\n return True\n else:\n return False\n\n\ndef get_sql_from_file(file_name):\n file = open(file_name)\n lines = file.readlines().copy()\n sql = ''\n for line in lines:\n sql += line\n sql = sql.replace('\n', ' ').replace(' ', ' ').replace(' ', ' ')\n file.close()\n return sql\n\n\ndef test_hint_from_file(sql_file):\n db = Database()\n sql = get_sql_from_file(sql_file)\n success, result_cont = db.execute_sql(sql)\n print(success, result_cont)\n\n\ndef test_all():\n sql_files = all_sql_files()\n\n for sql_file in list(sql_files)[:-10]:\n if sql_file:\n test_hint_from_file(sql_file)\n\n\ndef test_one():\n res_path = \"{}/join-order-benchmark-master/\".format(\n os.path.dirname(os.path.abspath(__file__)))\n test_hint_from_file(res_path + '1a.sql')\n\n\nif __name__ == '__main__':\n for i in range(0, REPEATCOUNT):\n TIMELOG.write(str(int(time.time()))+\";\")\n test_all()\n TIMELOG.write(str(int(time.time()))+\"\n\")\n TIMELOG.flush()\n\n TIMELOG.close()\n\n)\n", + "description": "In a database used by an online marketplace, there is a performance issue when joining multiple tables due to poor join performance. This, combined with high CPU contention, leads to slower processing and response times.\n", + "desc": "In a business intelligence scenario, suppose there is a database named 'CorporateAnalyticsDB' used for analyzing various business data of a large corporation. This database contains multiple complex tables such as 'SalesData', 'CustomerProfiles', 'ProductCatalog', and 'MarketTrends', each filled with a large number of rows and columns, recording detailed business information. In this particular scenario, the performance of join queries across these tables is poor. These join queries are used by analysts to generate comprehensive business reports that involve linking data from multiple tables. For example, they might need to join the 'SalesData' table (containing sales records) with the 'CustomerProfiles' table (containing customer information) to analyze purchasing behaviors. They might also need to incorporate data from the 'ProductCatalog' (product catalog) and 'MarketTrends' (market trends) to gain deeper market insights. However, due to ineffective indexing of join keys or poor query optimization, executing these join operations can be very slow. This lack of performance can cause delays and inefficiencies when generating the desired reports. Additionally, during peak periods when multiple complex join queries are executed simultaneously, there can be competition for CPU resources. 
This could further reduce the efficiency of these queries, as the server's CPU resources may be insufficient to handle the workload.\n" + }, + "447": { + "start_time": "1697362045", + "end_time": "1697362195", + "start_timestamp": "2023-10-15 17:27:25", + "end_timestamp": "2023-10-15 17:29:55", + "alerts": [], + "labels": [ + "FETCH_LARGE_DATA", + "CORRELATED SUBQUERY" + ], + "command": "python anomaly_trigger/main.py --anomaly FETCH_LARGE_DATA,CORRELATED_SUBQUERY", + "script": "import os\nimport re\nimport time\n\nimport psycopg2\n\n\nREPEATCOUNT = 1\nTIMELOGPATH = str(int(time.time())) + \"_tpch_trigger_time_log.txt\"\nTIMELOG = open(TIMELOGPATH, 'w+')\n\n\nclass Database():\n\n def __init__(self):\n self.conn = None\n self.conn = psycopg2.connect(database='tpch',\n user='xxxx',\n password='xxxx',\n host='xxxx',\n port=5432)\n\n def execute_sql(self, sql):\n fail = 1\n cur = self.conn.cursor()\n i = 0\n cnt = 3\n while fail == 1 and i < cnt:\n try:\n fail = 0\n cur.execute(sql)\n except BaseException as error:\n fail = 1\n print(error)\n res = []\n if fail == 0:\n res = cur.fetchall()\n i = i + 1\n if fail == 1:\n # print(\"SQL Execution Fatal!!\", sql)\n return 0, ''\n elif fail == 0:\n return 1, res\n\n\ndef all_sql_files():\n res_path = \"{}/tpch-queries/\".format(\n os.path.dirname(os.path.abspath(__file__)))\n # all_file_list = list(filter(file_filter, os.listdir(res_path)))\n # all_file_list = sorted(all_file_list, key=custom_sort)\n all_file_list = [\n '4.explain.sql']\n\n print(all_file_list)\n files_list = []\n for file in all_file_list:\n files_list.append(res_path + file)\n return files_list\n\n\ndef custom_sort(item):\n # \u63d0\u53d6\u6570\u5b57\u548c\u5b57\u6bcd\u90e8\u5206\n match = re.match(r'(\\d+)(\\D+)', item)\n # \u5c06\u6570\u5b57\u90e8\u5206\u8f6c\u6362\u4e3a\u6574\u6570\u4ee5\u8fdb\u884c\u6bd4\u8f83\n num_part = int(match.group(1))\n # \u8fd4\u56de\u5143\u7ec4\u4ee5\u6309\u6570\u5b57\u548c\u5b57\u6bcd\u6392\u5e8f\n return (num_part, match.group(2))\n\n\ndef file_filter(f):\n if f[-4:] == '.sql' and 'schema' not in f and 'fkindexes' not in f:\n return True\n else:\n return False\n\n\ndef get_sql_from_file(file_name):\n file = open(file_name)\n lines = file.readlines().copy()\n sql = ''\n for line in lines:\n sql += line\n sql = sql.replace('\n', ' ').replace(' ', ' ').replace(' ', ' ')\n file.close()\n return sql\n\n\ndef test_hint_from_file(sql_file):\n db = Database()\n sql = get_sql_from_file(sql_file)\n success, result_cont = db.execute_sql(sql)\n print(success, result_cont)\n\n\ndef test_all():\n sql_files = all_sql_files()\n\n for sql_file in list(sql_files):\n if sql_file:\n test_hint_from_file(sql_file)\n\n\ndef test_one():\n res_path = \"{}/tpch-queries/\".format(\n os.path.dirname(os.path.abspath(__file__)))\n test_hint_from_file(res_path + '1.explain.sql')\n\n\nif __name__ == '__main__':\n for i in range(0, REPEATCOUNT):\n TIMELOG.write(str(int(time.time()))+\";\")\n test_all()\n TIMELOG.write(str(int(time.time()))+\"\n\")\n TIMELOG.flush()\n\n TIMELOG.close()\n", + "description": "In an online marketplace, if there is a need to fetch a large amount of data from the database, such as inventory information for each product, and the query involves executing correlated subqueries, it may cause a performance issue. 
Therefore, using the script \"anomaly_trigger/main.py --anomaly FETCH_LARGE_DATA,CORRELATED_SUBQUERY\" would simulate this scenario and trigger the corresponding exception in the database.\n", + "desc": "In the database of an e-commerce platform, there is a database named 'ECommerceDB', which includes a crucial table named 'ProductInventory' for recording the inventory information of various products. This table might contain inventory data for tens of thousands or even hundreds of thousands of products. The inventory information for each product includes the product ID, current stock level, last inventory update time, supplier ID, warehouse location, and other details. In this database, querying the inventory level of each product may require performing related subqueries. For example, a common query might be to determine the total current inventory of all products within a specific category. This type of query might first involve selecting all products of a particular category from the 'ProductDetails' table, then conducting subqueries on the 'ProductInventory' table to obtain the inventory data for these products. When the number of products is very large, the performance of these related subqueries can become inefficient. For instance, if a category includes thousands of products, executing individual subqueries for each product to obtain inventory information would be time-consuming. In such cases, due to the need to retrieve inventory information for a large number of products, the database might need to read a significant amount of data from the disk, which could lead to I/O bottlenecks.\n" + }, + "448": { + "start_time": "1697362255", + "end_time": "1697362327", + "start_timestamp": "2023-10-15 17:30:55", + "end_timestamp": "2023-10-15 17:32:07", + "alerts": [], + "labels": [ + "highly concurrent commits or highly concurrent inserts" + ], + "command": "python anomaly_trigger/main.py --anomaly INSERT_LARGE_DATA", + "script": "import psycopg2\nimport sys\nsys.path.append('/root/DB-GPT/')\nimport time\nimport datetime\nimport random\nimport yaml\nfrom multiprocessing.pool import *\n\nclass DBArgs(object):\n\n def __init__(self, dbtype, config, dbname=None):\n self.dbtype = dbtype\n if self.dbtype == 'mysql':\n self.host = config['host']\n self.port = config['port']\n self.user = config['user']\n self.password = config['password']\n self.dbname = dbname if dbname else config['dbname']\n self.driver = 'com.mysql.jdbc.Driver'\n self.jdbc = 'jdbc:mysql://'\n else:\n self.host = config['host']\n self.port = config['port']\n self.user = config['user']\n self.password = config['password']\n self.dbname = dbname if dbname else config['dbname']\n self.driver = 'org.postgresql.Driver'\n self.jdbc = 'jdbc:postgresql://'\n\nclass Database():\n def __init__(self, args, timeout=-1):\n self.args = args\n self.conn = self.resetConn(timeout)\n\n\n # self.schema = self.compute_table_schema()\n\n def resetConn(self, timeout=-1):\n conn = psycopg2.connect(database=self.args.dbname,\n user=self.args.user,\n password=self.args.password,\n host=self.args.host,\n port=self.args.port)\n return conn\n\n def execute_sqls(self,sql):\n self.conn =self.resetConn(timeout=-1)\n cur = self.conn.cursor()\n cur.execute(sql)\n self.conn.commit()\n cur.close()\n self.conn.close()\n\n def execute_sql_duration(self, duration, sql, max_id=0, commit_interval=500):\n self.conn = self.resetConn(timeout=-1)\n cursor = self.conn.cursor()\n start = time.time()\n cnt = 0\n if duration > 0:\n while (time.time() - start) < 
duration:\n if max_id > 0:\n id = random.randint(1, max_id - 1)\n cursor.execute(sql + str(id) + ';')\n else:\n cursor.execute(sql)\n cnt += 1\n if cnt % commit_interval == 0:\n self.conn.commit()\n else:\n print(\"error, the duration should be larger than 0\")\n self.conn.commit()\n cursor.close()\n self.conn.close()\n return cnt\n\n def concurrent_execute_sql(self, threads, duration, sql, max_id=0, commit_interval=500):\n pool = ThreadPool(threads)\n results = [pool.apply_async(self.execute_sql_duration, (duration, sql, max_id, commit_interval)) for _ in range(threads)]\n pool.close()\n pool.join()\n return results\n \ndef init():\n #add the config\n config_path = \"/root/DB-GPT/config/tool_config.yaml\"\n with open(config_path, 'r') as config_file:\n config = yaml.safe_load(config_file) \n db_args =DBArgs('pgsql', config)\n return db_args\n\n#create a table\ndef create_table(table_name,colsize, ncolumns):\n db=Database(init())\n column_definitions = ', '.join(f'name{i} varchar({colsize})' for i in range(ncolumns))\n creat_sql = f'CREATE TABLE {table_name} (id int, {column_definitions}, time timestamp);'\n db.execute_sqls(creat_sql)\n\n#delete the table\ndef delete_table(table_name):\n db=Database(init())\n delete_sql=f'DROP TABLE if exists {table_name}'\n db.execute_sqls(delete_sql)\n\n#print the current time\ndef print_time():\n current_time = datetime.datetime.now()\n formatted_time = current_time.strftime(\"%Y-%m-%d %H:%M:%S\")\n print(formatted_time)\n\n\ndef insert_large_data(threads,duration,ncolumns,nrows,colsize,table_name='table1'):\n\n print_time()\n #Delete undeleted tables\n delete_table(table_name)\n #create a new table\n create_table(table_name,colsize, ncolumns)\n db=Database(init())\n #insert the data\n #insert_definitions = ', '.join(f'repeat(round(random()*999)::text,{(colsize//3)})' for i in range(ncolumns))\n insert_definitions = ', '.join(f'(SELECT substr(md5(random()::text), 1, {colsize}))' for i in range(ncolumns))\n insert_data=f'insert into {table_name} select generate_series(1,{nrows}),{insert_definitions}, now();'\n db.concurrent_execute_sql(threads,duration,insert_data,commit_interval=1)\n\n #delete the table\n delete_table(table_name)\n \n #print the end time\n print_time()\nif __name__ == \"__main__\":\n # Number of threads to use for concurrent inserts\n num_threads = 104\n \n # Duration for which to run the inserts (in seconds)\n insert_duration = None\n \n # Number of columns in the table\n num_columns = 19\n \n # Number of rows to insert\n num_rows = 68\n \n # Size of each column (in characters)\n column_size = 21\n \n # Table name\n table_name = 'table1'\n \n # Call the insert_large_data function\n insert_large_data(num_threads, insert_duration, num_columns, num_rows, column_size, table_name)\n", + "description": "In a real-life scenario, this script simulates a situation where a system is trying to insert a large amount of data into a database. The script is triggered by running the command \"python anomaly_trigger/main.py --anomaly INSERT_LARGE_DATA --threads 104 --ncolumn 19 --colsize 21 --nrow 68\". The system is inserting data from 104 sensors into the database table, which has 19 columns and 68 rows. Each column can hold up to 21 characters. 
The purpose of this simulation is to test the system's ability to handle the simultaneous insertion of a large amount of data and to detect any exceptions that may occur during this process.\n", + "desc": "In an Internet of Things (IoT) scenario, there is a database called 'IoTDataDB' that is used for storing and analyzing sensor data from various devices. This database is designed to handle a large volume of data from sensors placed in different environments. The main table in the database is 'SensorReadings' which contains information about readings from 68 sensors. Each row in the table represents a reading from a specific sensor and contains 19 columns including sensor ID, sensor type, reading value, timestamp, location, and other related data. In this scenario, there is a large influx of data from the sensors, with 104 devices actively transmitting readings to the database simultaneously. However, due to the high number of concurrent write requests and the lack of efficient data insertion mechanisms or indexing strategies, the database may encounter difficulties in processing all these write requests. This can lead to performance issues such as increased write latency and potentially even data loss. The inefficiency in handling the large amount of incoming data can create anomalies in the database, affecting the overall functionality and integrity of the IoT data processing system. It is important to optimize the database design, implement efficient data insertion techniques, and ensure proper indexing to prevent these anomalies and ensure smooth operation of the IoT data processing system.\n" + }, + "449": { + "start_time": "1697362387", + "end_time": "1697362458", + "start_timestamp": "2023-10-15 17:33:07", + "end_timestamp": "2023-10-15 17:34:18", + "alerts": [], + "labels": [ + "highly concurrent commits or highly concurrent inserts" + ], + "command": "python anomaly_trigger/main.py --anomaly INSERT_LARGE_DATA", + "script": "import psycopg2\nimport sys\nsys.path.append('/root/DB-GPT/')\nimport time\nimport datetime\nimport random\nimport yaml\nfrom multiprocessing.pool import *\n\nclass DBArgs(object):\n\n def __init__(self, dbtype, config, dbname=None):\n self.dbtype = dbtype\n if self.dbtype == 'mysql':\n self.host = config['host']\n self.port = config['port']\n self.user = config['user']\n self.password = config['password']\n self.dbname = dbname if dbname else config['dbname']\n self.driver = 'com.mysql.jdbc.Driver'\n self.jdbc = 'jdbc:mysql://'\n else:\n self.host = config['host']\n self.port = config['port']\n self.user = config['user']\n self.password = config['password']\n self.dbname = dbname if dbname else config['dbname']\n self.driver = 'org.postgresql.Driver'\n self.jdbc = 'jdbc:postgresql://'\n\nclass Database():\n def __init__(self, args, timeout=-1):\n self.args = args\n self.conn = self.resetConn(timeout)\n\n\n # self.schema = self.compute_table_schema()\n\n def resetConn(self, timeout=-1):\n conn = psycopg2.connect(database=self.args.dbname,\n user=self.args.user,\n password=self.args.password,\n host=self.args.host,\n port=self.args.port)\n return conn\n\n def execute_sqls(self,sql):\n self.conn =self.resetConn(timeout=-1)\n cur = self.conn.cursor()\n cur.execute(sql)\n self.conn.commit()\n cur.close()\n self.conn.close()\n\n def execute_sql_duration(self, duration, sql, max_id=0, commit_interval=500):\n self.conn = self.resetConn(timeout=-1)\n cursor = self.conn.cursor()\n start = time.time()\n cnt = 0\n if duration > 0:\n while (time.time() - start) < 
duration:\n if max_id > 0:\n id = random.randint(1, max_id - 1)\n cursor.execute(sql + str(id) + ';')\n else:\n cursor.execute(sql)\n cnt += 1\n if cnt % commit_interval == 0:\n self.conn.commit()\n else:\n print(\"error, the duration should be larger than 0\")\n self.conn.commit()\n cursor.close()\n self.conn.close()\n return cnt\n\n def concurrent_execute_sql(self, threads, duration, sql, max_id=0, commit_interval=500):\n pool = ThreadPool(threads)\n results = [pool.apply_async(self.execute_sql_duration, (duration, sql, max_id, commit_interval)) for _ in range(threads)]\n pool.close()\n pool.join()\n return results\n \ndef init():\n #add the config\n config_path = \"/root/DB-GPT/config/tool_config.yaml\"\n with open(config_path, 'r') as config_file:\n config = yaml.safe_load(config_file) \n db_args =DBArgs('pgsql', config)\n return db_args\n\n#create a table\ndef create_table(table_name,colsize, ncolumns):\n db=Database(init())\n column_definitions = ', '.join(f'name{i} varchar({colsize})' for i in range(ncolumns))\n creat_sql = f'CREATE TABLE {table_name} (id int, {column_definitions}, time timestamp);'\n db.execute_sqls(creat_sql)\n\n#delete the table\ndef delete_table(table_name):\n db=Database(init())\n delete_sql=f'DROP TABLE if exists {table_name}'\n db.execute_sqls(delete_sql)\n\n#print the current time\ndef print_time():\n current_time = datetime.datetime.now()\n formatted_time = current_time.strftime(\"%Y-%m-%d %H:%M:%S\")\n print(formatted_time)\n\n\ndef insert_large_data(threads,duration,ncolumns,nrows,colsize,table_name='table1'):\n\n print_time()\n #Delete undeleted tables\n delete_table(table_name)\n #create a new table\n create_table(table_name,colsize, ncolumns)\n db=Database(init())\n #insert the data\n #insert_definitions = ', '.join(f'repeat(round(random()*999)::text,{(colsize//3)})' for i in range(ncolumns))\n insert_definitions = ', '.join(f'(SELECT substr(md5(random()::text), 1, {colsize}))' for i in range(ncolumns))\n insert_data=f'insert into {table_name} select generate_series(1,{nrows}),{insert_definitions}, now();'\n db.concurrent_execute_sql(threads,duration,insert_data,commit_interval=1)\n\n #delete the table\n delete_table(table_name)\n \n #print the end time\n print_time()\nif __name__ == \"__main__\":\n # Number of threads to use for concurrent inserts\n num_threads = 104\n \n # Duration for which to run the inserts (in seconds)\n insert_duration = None\n \n # Number of columns in the table\n num_columns = 25\n \n # Number of rows to insert\n num_rows = 65\n \n # Size of each column (in characters)\n column_size = 61\n \n # Table name\n table_name = 'table1'\n \n # Call the insert_large_data function\n insert_large_data(num_threads, insert_duration, num_columns, num_rows, column_size, table_name)\n", + "description": "In a data-intensive application, 104 sensors are generating a large amount of data that needs to be inserted into a database. Each data record contains 25 columns, with each column having a size of 61 characters. There are a total of 65 data records to be inserted. This simulates the scenario where a high number of sensor inputs overwhelm the database insertion process, potentially causing exceptions or delays in data processing.\n", + "desc": "In an Internet of Things (IoT) scenario, there is a database called 'IoTDataDB' that is used to store sensor data from various devices. This database is designed to handle a large volume of data and has a key table called 'SensorReadings' that records detailed information about the sensor readings. 
The table contains 65 rows of data, each representing a reading from a different sensor. There are a total of 25 columns, each containing information up to 61 characters, including the sensor ID, reading type, reading value, timestamp, sensor location, and status information. When adding a large amount of sensor data from 104 different devices simultaneously, the database might encounter performance issues. This can be due to factors such as insufficient buffering mechanisms, lack of proper data partitioning, or inadequate indexing. These issues can lead to increased write latency, database locking, and anomalies in the database. Such anomalies can result in delays or failures in storing the sensor data, which can impact the overall efficiency and reliability of the IoT system.\n" + }, + "450": { + "start_time": "1697362518", + "end_time": "1697362578", + "start_timestamp": "2023-10-15 17:35:18", + "end_timestamp": "2023-10-15 17:36:18", + "alerts": [], + "labels": [ + "highly concurrent updates" + ], + "command": "python anomaly_trigger/main.py --anomaly LOCK_CONTENTION", + "script": "import psycopg2\nimport sys\nsys.path.append('/root/DB-GPT/')\nimport time\nimport datetime\nimport random\nimport yaml\nfrom multiprocessing.pool import *\n\n\nclass DBArgs(object):\n\n def __init__(self, dbtype, config, dbname=None):\n self.dbtype = dbtype\n if self.dbtype == 'mysql':\n self.host = config['host']\n self.port = config['port']\n self.user = config['user']\n self.password = config['password']\n self.dbname = dbname if dbname else config['dbname']\n self.driver = 'com.mysql.jdbc.Driver'\n self.jdbc = 'jdbc:mysql://'\n else:\n self.host = config['host']\n self.port = config['port']\n self.user = config['user']\n self.password = config['password']\n self.dbname = dbname if dbname else config['dbname']\n self.driver = 'org.postgresql.Driver'\n self.jdbc = 'jdbc:postgresql://'\n\nclass Database():\n def __init__(self, args, timeout=-1):\n self.args = args\n self.conn = self.resetConn(timeout)\n\n\n # self.schema = self.compute_table_schema()\n\n def resetConn(self, timeout=-1):\n conn = psycopg2.connect(database=self.args.dbname,\n user=self.args.user,\n password=self.args.password,\n host=self.args.host,\n port=self.args.port)\n return conn\n \n def execute_sqls(self,sql):\n self.conn =self.resetConn(timeout=-1)\n cur = self.conn.cursor()\n cur.execute(sql)\n self.conn.commit()\n cur.close()\n self.conn.close()\n\n def execute_sql_duration(self, duration, sql, max_id=0, commit_interval=500):\n self.conn = self.resetConn(timeout=-1)\n cursor = self.conn.cursor()\n start = time.time()\n cnt = 0\n if duration > 0:\n while (time.time() - start) < duration:\n if max_id > 0:\n id = random.randint(1, max_id - 1)\n cursor.execute(sql + str(id) + ';')\n else:\n cursor.execute(sql)\n cnt += 1\n if cnt % commit_interval == 0:\n self.conn.commit()\n else:\n print(\"error, the duration should be larger than 0\")\n self.conn.commit()\n cursor.close()\n self.conn.close()\n return cnt\n\n def concurrent_execute_sql(self, threads, duration, sql, max_id=0, commit_interval=500):\n pool = ThreadPool(threads)\n results = [pool.apply_async(self.execute_sql_duration, (duration, sql, max_id, commit_interval)) for _ in range(threads)]\n pool.close()\n pool.join()\n return results\n\ndef init():\n #add the config\n config_path = \"/root/DB-GPT/config/tool_config.yaml\"\n with open(config_path, 'r') as config_file:\n config = yaml.safe_load(config_file) \n db_args =DBArgs('pgsql', config)\n return db_args\n\n\n#create a 
table\ndef create_table(table_name,colsize, ncolumns):\n db=Database(init())\n column_definitions = ', '.join(f'name{i} varchar({colsize})' for i in range(ncolumns))\n creat_sql = f'CREATE TABLE {table_name} (id int, {column_definitions}, time timestamp);'\n db.execute_sqls(creat_sql)\n\n#delete the table\ndef delete_table(table_name):\n db=Database(init())\n delete_sql=f'DROP TABLE if exists {table_name}'\n db.execute_sqls(delete_sql)\n\n#print the current time\ndef print_time():\n current_time = datetime.datetime.now()\n formatted_time = current_time.strftime(\"%Y-%m-%d %H:%M:%S\")\n print(formatted_time)\n\n\ndef lock_contention(threads,duration,ncolumns,nrows,colsize,table_name='table1'):\n #create a new table\n print_time()\n delete_table(table_name)\n create_table(table_name,colsize, ncolumns)\n db=Database(init())\n # insert some data to be updated \n insert_definitions = ', '.join(f'(SELECT substr(md5(random()::text), 1, {colsize}))' for i in range(ncolumns))\n insert_data=f'insert into {table_name} select generate_series(1,{nrows}),{insert_definitions}, now();' \n db.execute_sqls(insert_data) \n pool = Pool(threads)\n for _ in range(threads):\n pool.apply_async(\n lock, (table_name, ncolumns, colsize, duration, nrows))\n pool.close()\n pool.join()\n #delete the table\n delete_table(table_name)\n print_time()\n\ndef lock(table_name, ncolumns, colsize, duration, nrows):\n args=init()\n start = time.time()\n #lock_contention\n while time.time()-start < duration:\n conn = psycopg2.connect(database=args.dbname, user=args.user, password=args.password,\n host=args.host, port=args.port)\n cur = conn.cursor()\n while time.time()-start < duration:\n col_name = random.randint(0, ncolumns-1)\n row_name = random.randint(1, nrows-1)\n lock_contention = f'update {table_name} set name{col_name}=(SELECT substr(md5(random()::text), 1, {colsize})) where id ={row_name}'\n #db.concurrent_execute_sql(threads,duration,lock_contention,nrows)\n cur.execute(lock_contention)\n conn.commit()\n conn.commit()\n conn.close()\n\nif __name__ == \"__main__\":\n # Number of threads to use for concurrent inserts\n num_threads = 78\n \n # Duration for which to run the inserts (in seconds)\n insert_duration = None\n \n # Number of columns in the table\n num_columns = 50\n \n # Number of rows to insert\n num_rows = 259\n \n # Size of each column (in characters)\n column_size = 86\n \n # Table name\n table_name = 'table1'\n \n # Call the insert_large_data function\n lock_contention(num_threads, insert_duration, num_columns, num_rows, column_size, table_name)\n", + "description": "In a database for managing a large inventory, 78 users simultaneously try to update a table that contains 50 columns and 259 rows of product records. Each column has a size of 86 characters. These users compete with each other to lock the database table for the update operation. This simulation aims to trigger a database exception caused by the contention for locking the table.\n", + "desc": "In the database of a banking system, there is a table named 'TransactionRecords' that stores information about various banking transactions. This table contains 259 rows of data, each representing a transaction record, with a total of 50 columns, each column containing information up to 86 characters. These columns may include transaction ID, transaction type (such as deposit, withdrawal, transfer), transaction amount, transaction date and time, account information, counterparty details, and transaction status. 
During peak hours, when 78 users simultaneously perform frequent update operations on the 'TransactionRecords' table, such as updating transaction statuses or modifying transaction amounts, there is a possibility of locking contention. This means that multiple users may compete for locking the same or adjacent rows in the table. If this locking contention lasts for a significant amount of time, it can affect the performance of the database and lead to delayed processing or failure of other users' transaction requests. This can ultimately impact the daily operations of the bank and cause interruptions in database services.\n" + }, + "451": { + "start_time": "1697362638", + "end_time": "1697362679", + "start_timestamp": "2023-10-15 17:37:18", + "end_timestamp": "2023-10-15 17:37:59", + "alerts": [], + "labels": [ + "highly deletes" + ], + "command": "python anomaly_trigger/main.py --anomaly VACUUM", + "script": "import psycopg2\nimport sys\nsys.path.append('/root/DB-GPT/')\nimport time\nimport datetime\nimport random\nimport yaml\nfrom multiprocessing.pool import *\n\n\nclass DBArgs(object):\n\n def __init__(self, dbtype, config, dbname=None):\n self.dbtype = dbtype\n if self.dbtype == 'mysql':\n self.host = config['host']\n self.port = config['port']\n self.user = config['user']\n self.password = config['password']\n self.dbname = dbname if dbname else config['dbname']\n self.driver = 'com.mysql.jdbc.Driver'\n self.jdbc = 'jdbc:mysql://'\n else:\n self.host = config['host']\n self.port = config['port']\n self.user = config['user']\n self.password = config['password']\n self.dbname = dbname if dbname else config['dbname']\n self.driver = 'org.postgresql.Driver'\n self.jdbc = 'jdbc:postgresql://'\n\nclass Database():\n def __init__(self, args, timeout=-1):\n self.args = args\n self.conn = self.resetConn(timeout)\n\n\n # self.schema = self.compute_table_schema()\n\n def resetConn(self, timeout=-1):\n conn = psycopg2.connect(database=self.args.dbname,\n user=self.args.user,\n password=self.args.password,\n host=self.args.host,\n port=self.args.port)\n return conn\n \n def execute_sqls(self,sql):\n self.conn =self.resetConn(timeout=-1)\n cur = self.conn.cursor()\n cur.execute(sql)\n self.conn.commit()\n cur.close()\n self.conn.close()\n\n def execute_sql_duration(self, duration, sql, max_id=0, commit_interval=500):\n self.conn = self.resetConn(timeout=-1)\n cursor = self.conn.cursor()\n start = time.time()\n cnt = 0\n if duration > 0:\n while (time.time() - start) < duration:\n if max_id > 0:\n id = random.randint(1, max_id - 1)\n cursor.execute(sql + str(id) + ';')\n else:\n cursor.execute(sql)\n cnt += 1\n if cnt % commit_interval == 0:\n self.conn.commit()\n else:\n print(\"error, the duration should be larger than 0\")\n self.conn.commit()\n cursor.close()\n self.conn.close()\n return cnt\n\n def concurrent_execute_sql(self, threads, duration, sql, max_id=0, commit_interval=500):\n pool = ThreadPool(threads)\n results = [pool.apply_async(self.execute_sql_duration, (duration, sql, max_id, commit_interval)) for _ in range(threads)]\n pool.close()\n pool.join()\n return results\n \ndef init():\n #add the config\n config_path = \"/root/DB-GPT/config/tool_config.yaml\"\n with open(config_path, 'r') as config_file:\n config = yaml.safe_load(config_file) \n db_args =DBArgs('pgsql', config)\n return db_args\n\n\n#create a table\ndef create_table(table_name,colsize, ncolumns):\n db=Database(init())\n column_definitions = ', '.join(f'name{i} varchar({colsize})' for i in range(ncolumns))\n creat_sql = 
f'CREATE TABLE {table_name} (id int, {column_definitions}, time timestamp);'\n db.execute_sqls(creat_sql)\n\n#delete the table\ndef delete_table(table_name):\n db=Database(init())\n delete_sql=f'DROP TABLE if exists {table_name}'\n db.execute_sqls(delete_sql)\n\n#print the current time\ndef print_time():\n current_time = datetime.datetime.now()\n formatted_time = current_time.strftime(\"%Y-%m-%d %H:%M:%S\")\n print(formatted_time)\n\n\ndef vacuum(threads,duration,ncolumns,nrows,colsize,table_name='table1'):\n db=Database(init())\n #create a new table\n print_time()\n delete_table(table_name)\n create_table(table_name,colsize, ncolumns)\n\n # insert some data to be deleted\n insert_definitions = ', '.join(f'(SELECT substr(md5(random()::text), 1, {colsize}))' for i in range(ncolumns))\n insert_data=f'insert into {table_name} select generate_series(1,{nrows}),{insert_definitions}, now();' \n db.execute_sqls(insert_data) \n\n # delete 80% of the rows\n delete_nrows=int(nrows*0.8)\n vacuum=f'delete from {table_name} where id < {delete_nrows};'\n db.execute_sqls(vacuum)\n\n # do the select , then the vacuum occurs\n select='select * from '+table_name+' where id='\n db.concurrent_execute_sql(threads,duration,select,nrows)\n\n #delete the table\n delete_table(table_name)\n print_time()\n\n\nif __name__ == \"__main__\":\n # Number of threads to use for concurrent inserts\n num_threads = 77\n \n # Duration for which to run the inserts (in seconds)\n insert_duration = None\n \n # Number of columns in the table\n num_columns = 19\n \n # Number of rows to insert\n num_rows = 3888305\n \n # Size of each column (in characters)\n column_size = 61\n \n # Table name\n table_name = 'table1'\n \n # Call the insert_large_data function\n vacuum(num_threads, insert_duration, num_columns, num_rows, column_size, table_name)\n", + "description": "In the database of an online store, during a period of heavy usage, 77 users simultaneously perform searches on a database table containing 19 columns and 3,888,305 rows of product records, each with a column size of 61 characters. This process occurs after a large-scale data cleaning operation, which may result in a database exception.\n", + "desc": "In an online store's e-commerce database, there is a table called 'ProductRecords' that stores detailed information about products. This table contains a massive amount of data, with 3,888,305 rows, each representing an individual product, and a total of 19 columns, each with a size of 61 characters. These columns include product ID, name, price, stock quantity, description, brand, category, size, color, weight, production date, expiration date, supplier information, country, rating, number of reviews, sales status, promotional information, image link, and other relevant attributes. In this scenario, the database administrator needs to perform a VACUUM operation, which involves reclaiming space occupied by deleted or outdated data in order to optimize database performance. This operation is crucial for maintaining database performance and preventing storage space wastage. However, due to the large scale of the 'ProductRecords' table, executing the VACUUM operation with 77 threads can create a significant impact on the database. If not properly managed, this operation can cause performance anomalies, such as increased CPU usage, slow query performance, or even temporary interruptions in database services. 
To mitigate these potential anomalies, the administrator should carefully schedule the VACUUM operation during low traffic periods and consider implementing optimization techniques, such as incremental processing or dividing the operation into smaller batches.\n" + }, + "452": { + "start_time": "1697362739", + "end_time": "1697362854", + "start_timestamp": "2023-10-15 17:38:59", + "end_timestamp": "2023-10-15 17:40:54", + "alerts": [], + "labels": [ + "too many indexes" + ], + "command": "python anomaly_trigger/main.py --anomaly REDUNDANT_INDEX", + "script": "import psycopg2\nimport sys\nsys.path.append('/root/DB-GPT/')\nimport time\nimport datetime\nimport random\nimport yaml\nfrom multiprocessing.pool import *\n\n\nclass DBArgs(object):\n\n def __init__(self, dbtype, config, dbname=None):\n self.dbtype = dbtype\n if self.dbtype == 'mysql':\n self.host = config['host']\n self.port = config['port']\n self.user = config['user']\n self.password = config['password']\n self.dbname = dbname if dbname else config['dbname']\n self.driver = 'com.mysql.jdbc.Driver'\n self.jdbc = 'jdbc:mysql://'\n else:\n self.host = config['host']\n self.port = config['port']\n self.user = config['user']\n self.password = config['password']\n self.dbname = dbname if dbname else config['dbname']\n self.driver = 'org.postgresql.Driver'\n self.jdbc = 'jdbc:postgresql://'\n\nclass Database():\n def __init__(self, args, timeout=-1):\n self.args = args\n self.conn = self.resetConn(timeout)\n\n\n # self.schema = self.compute_table_schema()\n\n def resetConn(self, timeout=-1):\n conn = psycopg2.connect(database=self.args.dbname,\n user=self.args.user,\n password=self.args.password,\n host=self.args.host,\n port=self.args.port)\n return conn\n \n def execute_sqls(self,sql):\n self.conn =self.resetConn(timeout=-1)\n cur = self.conn.cursor()\n cur.execute(sql)\n self.conn.commit()\n cur.close()\n self.conn.close()\n\n def execute_sql_duration(self, duration, sql, max_id=0, commit_interval=500):\n self.conn = self.resetConn(timeout=-1)\n cursor = self.conn.cursor()\n start = time.time()\n cnt = 0\n if duration > 0:\n while (time.time() - start) < duration:\n if max_id > 0:\n id = random.randint(1, max_id - 1)\n cursor.execute(sql + str(id) + ';')\n else:\n cursor.execute(sql)\n cnt += 1\n if cnt % commit_interval == 0:\n self.conn.commit()\n else:\n print(\"error, the duration should be larger than 0\")\n self.conn.commit()\n cursor.close()\n self.conn.close()\n return cnt\n\n def concurrent_execute_sql(self, threads, duration, sql, max_id=0, commit_interval=500):\n pool = ThreadPool(threads)\n results = [pool.apply_async(self.execute_sql_duration, (duration, sql, max_id, commit_interval)) for _ in range(threads)]\n pool.close()\n pool.join()\n return results\n \n def build_index(self, table_name, idx_num):\n self.conn = self.resetConn(timeout=-1)\n cursor = self.conn.cursor()\n \n for i in range(0, idx_num):\n the_sql = 'CREATE INDEX index_' + table_name + '_' + str(i) + ' ON ' + table_name + '(name' + str(i) + ');'\n print(the_sql)\n cursor.execute(the_sql)\n\n \n self.conn.commit()\n self.conn.close()\n return\n\n\n \n def drop_index(self,table_name):\n self.conn = self.resetConn(timeout=-1)\n cursor = self.conn.cursor()\n cursor.execute(\"select indexname from pg_indexes where tablename='\"+table_name+\"';\")\n idxs = cursor.fetchall()\n for idx in idxs:\n the_sql = 'DROP INDEX ' + idx[0] + ';'\n cursor.execute(the_sql)\n print(the_sql)\n self.conn.commit()\n self.conn.close()\n return\n\n\ndef init():\n #add the config\n 
config_path = \"/root/DB-GPT/config/tool_config.yaml\"\n with open(config_path, 'r') as config_file:\n config = yaml.safe_load(config_file) \n db_args =DBArgs('pgsql', config)\n return db_args\n\n\n#create a table\ndef create_table(table_name,colsize, ncolumns):\n db=Database(init())\n column_definitions = ', '.join(f'name{i} varchar({colsize})' for i in range(ncolumns))\n creat_sql = f'CREATE TABLE {table_name} (id int, {column_definitions}, time timestamp);'\n db.execute_sqls(creat_sql)\n\n#delete the table\ndef delete_table(table_name):\n db=Database(init())\n delete_sql=f'DROP TABLE if exists {table_name}'\n db.execute_sqls(delete_sql)\n\n#print the current time\ndef print_time():\n current_time = datetime.datetime.now()\n formatted_time = current_time.strftime(\"%Y-%m-%d %H:%M:%S\")\n print(formatted_time)\n\ndef redundent_index(threads,duration,ncolumns,nrows,colsize,nindex,table_name='table1'):\n #create a new table\n print_time()\n delete_table(table_name)\n create_table(table_name,colsize, ncolumns)\n db=Database(init())\n # insert some data to be updated \n insert_definitions = ', '.join(f'(SELECT substr(md5(random()::text), 1, {colsize}))' for i in range(ncolumns))\n insert_data=f'insert into {table_name} select generate_series(1,{nrows}),{insert_definitions}, now();' \n db.execute_sqls(insert_data) \n\n #initialization of the indexes\n nindex=int((nindex*ncolumns)/10)\n db.build_index(table_name,nindex)\n id_index='CREATE INDEX index_'+table_name+'_id ON '+table_name+'(id);'\n db.execute_sqls(id_index)\n\n #lock_contention\n pool = Pool(threads)\n for _ in range(threads):\n pool.apply_async(\n lock, (table_name, ncolumns, colsize, duration, nrows))\n pool.close()\n pool.join()\n\n #drop the index\n db.drop_index(table_name)\n\n #delete the table\n delete_table(table_name)\n print_time()\n\n\ndef lock(table_name, ncolumns, colsize, duration, nrows):\n args=init()\n start = time.time()\n #lock_contention\n while time.time()-start < duration:\n conn = psycopg2.connect(database=args.dbname, user=args.user, password=args.password,\n host=args.host, port=args.port)\n cur = conn.cursor()\n while time.time()-start < duration:\n col_name = random.randint(0, ncolumns-1)\n row_name = random.randint(1, nrows-1)\n lock_contention = f'update {table_name} set name{col_name}=(SELECT substr(md5(random()::text), 1, {colsize})) where id ={row_name}'\n #db.concurrent_execute_sql(threads,duration,lock_contention,nrows)\n cur.execute(lock_contention)\n conn.commit()\n conn.commit()\n conn.close()\n\nif __name__ == \"__main__\":\n # Number of threads to use for concurrent inserts\n num_threads = 9\n \n # Duration for which to run the inserts (in seconds)\n insert_duration = None\n \n # Number of columns in the table\n num_columns = 100\n \n # Number of rows to insert\n num_rows = 925504\n \n # Size of each column (in characters)\n column_size = 77\n \n # Table name\n table_name = 'table1'\n \n nindex=6\n \n # Call the insert_large_data function\n redundent_index(num_threads, insert_duration, num_columns, num_rows, column_size, nindex,table_name)\n", + "description": "In a database for a large online marketplace, a redundant index scenario is being simulated. This scenario involves running a Python script called \"main.py\" with the specified anomaly \"REDUNDANT_INDEX\", along with the parameters \"--threads 9\", \"--ncolumn 100\", \"--colsize 77\", and \"--nrow 925504\". 
This means that there will be 9 users performing operations on a database table with 100 columns and 925,504 rows, where each column can hold up to 77 characters. The purpose of this simulation is to assess the potential impact of having redundant indexes on the database's storage footprint and performance.\n", + "desc": "In a business intelligence scenario, particularly involving the financial data of a large corporation, there is a database named 'CorporateFinanceDB'. This database is designed to store and process financial records and transactions. It contains multiple tables, one of which is the 'FinancialRecords' table. This table consists of 925,504 rows of data, each representing a financial record, and has 100 columns with each column containing information of up to 77 characters. These columns might include transaction ID, transaction type (such as income, expenditure, assets, liabilities), amount, date, department, project code, budget code, financial year, audit status, and so on. In this scenario, there is a need for query acceleration for complex financial analysis operations. To meet this need, the database administrator might create redundant indexes on specific columns or combinations of columns before executing the queries. However, if the number of indexes created is excessive or if these indexes are not effectively managed and maintained, they can consume additional storage space, cause performance overhead, and may even result in database fragmentation. Thus, this scenario involves the potential for redundant index creation and its impact on database performance and efficiency.\n" + }, + "453": { + "start_time": "1697362914", + "end_time": "1697363005", + "start_timestamp": "2023-10-15 17:41:54", + "end_timestamp": "2023-10-15 17:43:25", + "alerts": [], + "labels": [ + "INSERT_LARGE_DATA", + "IO_CONTENTION" + ], + "command": "python anomaly_trigger/main.py --anomaly INSERT_LARGE_DATA,IO_CONTENTION", + "script": "import os\nimport datetime\n\n#print the current time\ndef print_time():\n current_time = datetime.datetime.now()\n formatted_time = current_time.strftime(\"%Y-%m-%d %H:%M:%S\")\n print(formatted_time)\n\nif __name__ == \"__main__\":\n print_time()\n command = (\n \"su - root -c 'cd /sysbench-tpcc-master; \"\n \"./tpcc.lua --db-driver=pgsql --tables=2 --scale=3 --threads=50 --events=0 \"\n \"--pgsql-host=xxxx --pgsql-user=xxxx --pgsql-password=xxxx \"\n \"--pgsql-port=5432 --pgsql-db=tpcc --time=90 --rand-type=uniform --report-interval=10 run'\"\n )\n\n os.system(command)\n print_time()\n", + "description": "In a file sharing system where multiple users upload, download, or edit files at the same time, the system experiences I/O contention. This leads to a slowdown in file transfer operations.\n", + "desc": "In a file sharing system, there is a database called 'TeamFileShareDB', which is used by teams or organizations to share files. This database stores both the files themselves and their metadata, such as uploader information, file size, creation and modification dates, version history, access permissions, and download counts. During a typical workday, multiple users are actively uploading, downloading, and editing files. For example, a project team collaborates on an important report, constantly uploading the latest file versions for others to download and review. Additionally, this system handles the storage and sharing of large files, such as presentations, video conference recordings, and design drawings. 
However, due to the high concurrency of file operations, the database 'TeamFileShareDB' faces challenges related to I/O (input/output) contention. When multiple users simultaneously upload or download large files, the system's storage and network bandwidth can become strained. This I/O contention results in slower file transfer speeds, especially when there are limitations on bandwidth or insufficient processing capabilities on the server side. Furthermore, frequent write operations in the database, such as file uploads and metadata updates, can also impact database performance. During peak usage periods, the database may encounter issues with locking and transaction management, further slowing down file processing and metadata recording.\n" + }, + "454": { + "start_time": "1697363065", + "end_time": "1697363125", + "start_timestamp": "2023-10-15 17:44:25", + "end_timestamp": "2023-10-15 17:45:25", + "alerts": [], + "labels": [ + "POOR JOIN PERFORMANCE", + "CPU CONTENTION" + ], + "command": "python anomaly_trigger/main.py --anomaly POOR_JOIN_PERFORMANCE,CPU_CONTENTION", + "script": "import os\nimport re\nimport time\n\nimport psycopg2\n\n\nREPEATCOUNT = 1\nTIMELOGPATH = str(int(time.time())) + \"_job_trigger_time_log.txt\"\nTIMELOG = open(TIMELOGPATH, 'w+')\n\n\nclass Database():\n\n def __init__(self):\n self.conn = None\n self.conn = psycopg2.connect(database='imdbload',\n user='xxxx',\n password='xxxx',\n host='xxxx',\n port=5432)\n\n def execute_sql(self, sql):\n fail = 1\n cur = self.conn.cursor()\n i = 0\n cnt = 3\n while fail == 1 and i < cnt:\n try:\n fail = 0\n cur.execute(sql)\n except BaseException as error:\n fail = 1\n print(error)\n res = []\n if fail == 0:\n res = cur.fetchall()\n i = i + 1\n if fail == 1:\n # print(\"SQL Execution Fatal!!\", sql)\n return 0, ''\n elif fail == 0:\n return 1, res\n\n\ndef all_sql_files():\n res_path = \"{}/join-order-benchmark-master/\".format(\n os.path.dirname(os.path.abspath(__file__)))\n # all_file_list = list(filter(file_filter, os.listdir(res_path)))\n # all_file_list = sorted(all_file_list, key=custom_sort)\n all_file_list = [\n '1a.sql', '1b.sql', '1c.sql', '1d.sql',\n '2a.sql', '2b.sql', '2c.sql', '2d.sql',\n '3a.sql', '3b.sql', '3c.sql',\n '4a.sql', '4b.sql', '4c.sql',\n '5a.sql', '5b.sql', '5c.sql',\n '6a.sql', '6b.sql', '6c.sql', '6d.sql', '6e.sql', '6f.sql',\n '7a.sql', '7b.sql', '7c.sql',\n '8a.sql', '8b.sql', '8c.sql', '8d.sql',\n '9a.sql', '9b.sql', '9c.sql', '9d.sql',\n '10a.sql', '10b.sql', '10c.sql',\n '11a.sql', '11b.sql', '11c.sql', '11d.sql',\n '12a.sql', '12b.sql', '12c.sql',\n '13a.sql', '13b.sql', '13c.sql', '13d.sql',\n '14a.sql', '14b.sql', '14c.sql',\n '15a.sql', '15b.sql', '15c.sql', '15d.sql',\n '16a.sql', '16b.sql', '16c.sql', '16d.sql',\n '17a.sql', '17b.sql', '17c.sql', '17d.sql', '17e.sql', '17f.sql',\n '18a.sql', '18b.sql', '18c.sql',\n '19a.sql', '19b.sql', '19c.sql', '19d.sql',\n '20a.sql', '20b.sql', '20c.sql',\n '21a.sql', '21b.sql', '21c.sql',\n '22a.sql', '22b.sql', '22c.sql', '22d.sql',\n '23a.sql', '23b.sql', '23c.sql',\n '24a.sql', '24b.sql',\n '25a.sql', '25b.sql', '25c.sql',\n '26a.sql', '26b.sql', '26c.sql',\n '27a.sql', '27b.sql', '27c.sql',\n '28a.sql', '28b.sql', '28c.sql',\n '29a.sql', '29b.sql', '29c.sql',\n '30a.sql', '30b.sql', '30c.sql',\n '31a.sql', '31b.sql', '31c.sql',\n '32a.sql', '32b.sql',\n '33a.sql', '33b.sql', '33c.sql']\n\n print(all_file_list)\n files_list = []\n for file in all_file_list:\n files_list.append(res_path + file)\n return files_list\n\n\ndef custom_sort(item):\n # 
\u63d0\u53d6\u6570\u5b57\u548c\u5b57\u6bcd\u90e8\u5206\n match = re.match(r'(\\d+)(\\D+)', item)\n # \u5c06\u6570\u5b57\u90e8\u5206\u8f6c\u6362\u4e3a\u6574\u6570\u4ee5\u8fdb\u884c\u6bd4\u8f83\n num_part = int(match.group(1))\n # \u8fd4\u56de\u5143\u7ec4\u4ee5\u6309\u6570\u5b57\u548c\u5b57\u6bcd\u6392\u5e8f\n return (num_part, match.group(2))\n\n\ndef file_filter(f):\n if f[-4:] == '.sql' and 'schema' not in f and 'fkindexes' not in f:\n return True\n else:\n return False\n\n\ndef get_sql_from_file(file_name):\n file = open(file_name)\n lines = file.readlines().copy()\n sql = ''\n for line in lines:\n sql += line\n sql = sql.replace('\n', ' ').replace(' ', ' ').replace(' ', ' ')\n file.close()\n return sql\n\n\ndef test_hint_from_file(sql_file):\n db = Database()\n sql = get_sql_from_file(sql_file)\n success, result_cont = db.execute_sql(sql)\n print(success, result_cont)\n\n\ndef test_all():\n sql_files = all_sql_files()\n\n for sql_file in list(sql_files)[:-10]:\n if sql_file:\n test_hint_from_file(sql_file)\n\n\ndef test_one():\n res_path = \"{}/join-order-benchmark-master/\".format(\n os.path.dirname(os.path.abspath(__file__)))\n test_hint_from_file(res_path + '1a.sql')\n\n\nif __name__ == '__main__':\n for i in range(0, REPEATCOUNT):\n TIMELOG.write(str(int(time.time()))+\";\")\n test_all()\n TIMELOG.write(str(int(time.time()))+\"\n\")\n TIMELOG.flush()\n\n TIMELOG.close()\n\n)\n", + "description": "In a data analytics platform, multiple users are performing join operations on a database table containing a large amount of data. These join operations require a significant amount of CPU resources, leading to contention. As a result, the join performance deteriorates, causing delays in data processing and analysis.\n", + "desc": "In a business intelligence scenario, suppose there is a database named 'CorporateAnalyticsDB' used for storing and analyzing various business data of a large corporation. This database contains multiple complex tables such as 'SalesData', 'CustomerProfiles', 'ProductCatalog', and 'MarketTrends'. These tables store detailed business information used for generating comprehensive business reports. However, due to the large size of these tables and the involvement of multi-table joins, the performance of the join queries might be slow. This could be caused by ineffective indexing of the join keys or suboptimal query optimization. Executing these join operations can consume a significant amount of time and resources. Additionally, during peak periods, when multiple complex join queries are executed simultaneously, there may be competition for CPU resources. This competition for CPU resources can reduce the efficiency of these queries further. 
CPU contention might occur due to too many compute-intensive queries running on the database server or insufficient CPU resources on the server.\n" + }, + "455": { + "start_time": "1697363185", + "end_time": "1697363185", + "start_timestamp": "2023-10-15 17:46:25", + "end_timestamp": "2023-10-15 17:46:25", + "alerts": [], + "labels": [ + "FETCH_LARGE_DATA", + "CORRELATED SUBQUERY" + ], + "command": "python anomaly_trigger/main.py --anomaly FETCH_LARGE_DATA,CORRELATED_SUBQUERY", + "script": "import os\nimport re\nimport time\n\nimport psycopg2\n\n\nREPEATCOUNT = 1\nTIMELOGPATH = str(int(time.time())) + \"_tpch_trigger_time_log.txt\"\nTIMELOG = open(TIMELOGPATH, 'w+')\n\n\nclass Database():\n\n def __init__(self):\n self.conn = None\n self.conn = psycopg2.connect(database='tpch',\n user='xxxx',\n password='xxxx',\n host='xxxx',\n port=5432)\n\n def execute_sql(self, sql):\n fail = 1\n cur = self.conn.cursor()\n i = 0\n cnt = 3\n while fail == 1 and i < cnt:\n try:\n fail = 0\n cur.execute(sql)\n except BaseException as error:\n fail = 1\n print(error)\n res = []\n if fail == 0:\n res = cur.fetchall()\n i = i + 1\n if fail == 1:\n # print(\"SQL Execution Fatal!!\", sql)\n return 0, ''\n elif fail == 0:\n return 1, res\n\n\ndef all_sql_files():\n res_path = \"{}/tpch-queries/\".format(\n os.path.dirname(os.path.abspath(__file__)))\n # all_file_list = list(filter(file_filter, os.listdir(res_path)))\n # all_file_list = sorted(all_file_list, key=custom_sort)\n all_file_list = [\n '4.explain.sql']\n\n print(all_file_list)\n files_list = []\n for file in all_file_list:\n files_list.append(res_path + file)\n return files_list\n\n\ndef custom_sort(item):\n # \u63d0\u53d6\u6570\u5b57\u548c\u5b57\u6bcd\u90e8\u5206\n match = re.match(r'(\\d+)(\\D+)', item)\n # \u5c06\u6570\u5b57\u90e8\u5206\u8f6c\u6362\u4e3a\u6574\u6570\u4ee5\u8fdb\u884c\u6bd4\u8f83\n num_part = int(match.group(1))\n # \u8fd4\u56de\u5143\u7ec4\u4ee5\u6309\u6570\u5b57\u548c\u5b57\u6bcd\u6392\u5e8f\n return (num_part, match.group(2))\n\n\ndef file_filter(f):\n if f[-4:] == '.sql' and 'schema' not in f and 'fkindexes' not in f:\n return True\n else:\n return False\n\n\ndef get_sql_from_file(file_name):\n file = open(file_name)\n lines = file.readlines().copy()\n sql = ''\n for line in lines:\n sql += line\n sql = sql.replace('\n', ' ').replace(' ', ' ').replace(' ', ' ')\n file.close()\n return sql\n\n\ndef test_hint_from_file(sql_file):\n db = Database()\n sql = get_sql_from_file(sql_file)\n success, result_cont = db.execute_sql(sql)\n print(success, result_cont)\n\n\ndef test_all():\n sql_files = all_sql_files()\n\n for sql_file in list(sql_files):\n if sql_file:\n test_hint_from_file(sql_file)\n\n\ndef test_one():\n res_path = \"{}/tpch-queries/\".format(\n os.path.dirname(os.path.abspath(__file__)))\n test_hint_from_file(res_path + '1.explain.sql')\n\n\nif __name__ == '__main__':\n for i in range(0, REPEATCOUNT):\n TIMELOG.write(str(int(time.time()))+\";\")\n test_all()\n TIMELOG.write(str(int(time.time()))+\"\n\")\n TIMELOG.flush()\n\n TIMELOG.close()\n\n\n\n", + "description": "In an online store's database, when trying to retrieve a large amount of data related to products and perform correlated subqueries, there may be a performance issue. This can occur when querying the inventory of each product, especially if the subqueries are not optimized properly.\n", + "desc": "In a scenario related to an e-commerce platform, let's imagine a database specifically designed for this platform, called 'ECommerceDB'. 
Within this database, there is a table known as 'ProductDetails' that stores detailed information about various products offered by the platform. This table consists of tens of thousands or even hundreds of thousands of rows of data, each representing a unique product entry. The columns within this table include information such as product ID, name, price, stock quantity, brand, category, size, color, weight, user rating, number of reviews, production date, expiration date, supplier information, country, shipping method, discount information, image link, description, and the product's availability status.In order to perform certain queries on the database, users might need to utilize correlated subqueries. For example, a common query might involve determining the total available stock of all products within a specific category. To obtain this information, subqueries are executed on the 'ProductDetails' table to retrieve the necessary data. However, when the number of products within a category is large, the performance of these correlated subqueries can be impacted. This is due to the need to retrieve inventory information for a significant number of products, leading to lengthy query execution times. As a result, the database might need to read a substantial amount of data from the disk, resulting in potential I/O bottlenecks.\n" + }, + "456": { + "start_time": "1697440521", + "end_time": "1697440591", + "start_timestamp": "2023-10-16 15:15:21", + "end_timestamp": "2023-10-16 15:16:31", + "alerts": [], + "labels": [ + "missing indexes" + ], + "command": "python anomaly_trigger/main.py --anomaly MISSING_INDEXES", + "script": "import import psycopg2\nimport sys\nsys.path.append('/root/DB-GPT/')\nimport time\nimport datetime\nimport random\nimport yaml\nfrom multiprocessing.pool import *\n\n\nclass DBArgs(object):\n\n def __init__(self, dbtype, config, dbname=None):\n self.dbtype = dbtype\n if self.dbtype == 'mysql':\n self.host = config['host']\n self.port = config['port']\n self.user = config['user']\n self.password = config['password']\n self.dbname = dbname if dbname else config['dbname']\n self.driver = 'com.mysql.jdbc.Driver'\n self.jdbc = 'jdbc:mysql://'\n else:\n self.host = config['host']\n self.port = config['port']\n self.user = config['user']\n self.password = config['password']\n self.dbname = dbname if dbname else config['dbname']\n self.driver = 'org.postgresql.Driver'\n self.jdbc = 'jdbc:postgresql://'\n\nclass Database():\n def __init__(self, args, timeout=-1):\n self.args = args\n self.conn = self.resetConn(timeout)\n\n\n # self.schema = self.compute_table_schema()\n\n def resetConn(self, timeout=-1):\n conn = psycopg2.connect(database=self.args.dbname,\n user=self.args.user,\n password=self.args.password,\n host=self.args.host,\n port=self.args.port)\n return conn\n \n def execute_sqls(self,sql):\n self.conn =self.resetConn(timeout=-1)\n cur = self.conn.cursor()\n cur.execute(sql)\n self.conn.commit()\n cur.close()\n self.conn.close()\n\n def execute_sql_duration(self, duration, sql, max_id=0, commit_interval=500):\n self.conn = self.resetConn(timeout=-1)\n cursor = self.conn.cursor()\n start = time.time()\n cnt = 0\n if duration > 0:\n while (time.time() - start) < duration:\n if max_id > 0:\n id = random.randint(1, max_id - 1)\n cursor.execute(sql + str(id) + ';')\n else:\n cursor.execute(sql)\n cnt += 1\n if cnt % commit_interval == 0:\n self.conn.commit()\n else:\n print(\"error, the duration should be larger than 0\")\n self.conn.commit()\n cursor.close()\n 
self.conn.close()\n return cnt\n\n def concurrent_execute_sql(self, threads, duration, sql, max_id=0, commit_interval=500):\n pool = ThreadPool(threads)\n results = [pool.apply_async(self.execute_sql_duration, (duration, sql, max_id, commit_interval)) for _ in range(threads)]\n pool.close()\n pool.join()\n return results\n\ndef init():\n #add the config\n config_path = \"/root/DB-GPT/config/tool_config.yaml\"\n with open(config_path, 'r') as config_file:\n config = yaml.safe_load(config_file) \n db_args =DBArgs('pgsql', config)\n return db_args\n\n\n#create a table\ndef create_table(table_name,colsize, ncolumns):\n db=Database(init())\n column_definitions = ', '.join(f'name{i} varchar({colsize})' for i in range(ncolumns))\n creat_sql = f'CREATE TABLE {table_name} (id int, {column_definitions}, time timestamp);'\n db.execute_sqls(creat_sql)\n\n#delete the table\ndef delete_table(table_name):\n db=Database(init())\n delete_sql=f'DROP TABLE if exists {table_name}'\n db.execute_sqls(delete_sql)\n\n#print the current time\ndef print_time():\n current_time = datetime.datetime.now()\n formatted_time = current_time.strftime(\"%Y-%m-%d %H:%M:%S\")\n print(formatted_time)\n\n\ndef missing_index(threads,duration,ncolumns,nrows,colsize,table_name='table1'):\n #create a new table\n print_time()\n db=Database(init())\n delete_table(table_name)\n create_table(table_name,colsize, ncolumns)\n\n # insert some data to be selected \n insert_definitions = ', '.join(f'(SELECT substr(md5(random()::text), 1, {colsize}))' for i in range(ncolumns))\n insert_data=f'insert into {table_name} select generate_series(1,{nrows}),{insert_definitions}, now();'\n db.execute_sqls(insert_data) \n\n #select without the index\n missing_index='select * from '+table_name+' where id='\n db.concurrent_execute_sql(threads,duration,missing_index,nrows)\n\n #delete the table\n delete_table(table_name)\n #print the end time\n print_time()\n\nif __name__ == \"__main__\":\n # Number of threads to use for concurrent inserts\n num_threads = 5\n \n # Duration for which to run the inserts (in seconds)\n insert_duration = None\n \n # Number of columns in the table\n num_columns = 5\n \n # Number of rows to insert\n num_rows = 2000000\n \n # Size of each column (in characters)\n column_size = 50\n \n # Table name\n table_name = 'table1'\n \n # Call the insert_large_data function\n missing_index(num_threads, insert_duration, num_columns, num_rows, column_size, table_name)\n", + "description": "In the database of an online shopping platform, there are 5 users searching in a database table containing 5 columns, 2,000,000 rows, each column size of 50 characters. However, the search lacks the necessary index, which may result in an exception in the database.\n", + "desc": "In an e-commerce scenario, there is an e-commerce platform called 'EcommercePlatform' that uses a database to store information about various products. The database consists of multiple tables, including a key table named 'ProductDetails', which contains detailed information about each product. Suppose this table contains 2,000,000 rows of data, with each row representing a different product. The table has a total of 5 columns, including product ID, name, price, category, and description, with each column storing information of up to 50 characters. In this scenario, there is an issue with missing indexes on the 'ProductDetails' table, particularly on commonly used search columns such as name, category, and description. 
This lack of indexes makes it inefficient for the database to handle concurrent search requests. As a result, when 5 users simultaneously search for products on the platform, it can lead to slower search performance and potentially impact user experience. Therefore, adding necessary indexes to the 'ProductDetails' table is crucial to improve search efficiency and provide a smooth user experience on the e-commerce platform.\n" + }, + "457": { + "start_time": "1697440653", + "end_time": "1697440724", + "start_timestamp": "2023-10-16 15:17:33", + "end_timestamp": "2023-10-16 15:18:44", + "alerts": [], + "labels": [ + "missing indexes" + ], + "command": "python anomaly_trigger/main.py --anomaly MISSING_INDEXES", + "script": "import psycopg2\nimport sys\nsys.path.append('/root/DB-GPT/')\nimport time\nimport datetime\nimport random\nimport yaml\nfrom multiprocessing.pool import *\n\n\nclass DBArgs(object):\n\n def __init__(self, dbtype, config, dbname=None):\n self.dbtype = dbtype\n if self.dbtype == 'mysql':\n self.host = config['host']\n self.port = config['port']\n self.user = config['user']\n self.password = config['password']\n self.dbname = dbname if dbname else config['dbname']\n self.driver = 'com.mysql.jdbc.Driver'\n self.jdbc = 'jdbc:mysql://'\n else:\n self.host = config['host']\n self.port = config['port']\n self.user = config['user']\n self.password = config['password']\n self.dbname = dbname if dbname else config['dbname']\n self.driver = 'org.postgresql.Driver'\n self.jdbc = 'jdbc:postgresql://'\n\nclass Database():\n def __init__(self, args, timeout=-1):\n self.args = args\n self.conn = self.resetConn(timeout)\n\n\n # self.schema = self.compute_table_schema()\n\n def resetConn(self, timeout=-1):\n conn = psycopg2.connect(database=self.args.dbname,\n user=self.args.user,\n password=self.args.password,\n host=self.args.host,\n port=self.args.port)\n return conn\n \n def execute_sqls(self,sql):\n self.conn =self.resetConn(timeout=-1)\n cur = self.conn.cursor()\n cur.execute(sql)\n self.conn.commit()\n cur.close()\n self.conn.close()\n\n def execute_sql_duration(self, duration, sql, max_id=0, commit_interval=500):\n self.conn = self.resetConn(timeout=-1)\n cursor = self.conn.cursor()\n start = time.time()\n cnt = 0\n if duration > 0:\n while (time.time() - start) < duration:\n if max_id > 0:\n id = random.randint(1, max_id - 1)\n cursor.execute(sql + str(id) + ';')\n else:\n cursor.execute(sql)\n cnt += 1\n if cnt % commit_interval == 0:\n self.conn.commit()\n else:\n print(\"error, the duration should be larger than 0\")\n self.conn.commit()\n cursor.close()\n self.conn.close()\n return cnt\n\n def concurrent_execute_sql(self, threads, duration, sql, max_id=0, commit_interval=500):\n pool = ThreadPool(threads)\n results = [pool.apply_async(self.execute_sql_duration, (duration, sql, max_id, commit_interval)) for _ in range(threads)]\n pool.close()\n pool.join()\n return results\n\ndef init():\n #add the config\n config_path = \"/root/DB-GPT/config/tool_config.yaml\"\n with open(config_path, 'r') as config_file:\n config = yaml.safe_load(config_file) \n db_args =DBArgs('pgsql', config)\n return db_args\n\n\n#create a table\ndef create_table(table_name,colsize, ncolumns):\n db=Database(init())\n column_definitions = ', '.join(f'name{i} varchar({colsize})' for i in range(ncolumns))\n creat_sql = f'CREATE TABLE {table_name} (id int, {column_definitions}, time timestamp);'\n db.execute_sqls(creat_sql)\n\n#delete the table\ndef delete_table(table_name):\n db=Database(init())\n 
delete_sql=f'DROP TABLE if exists {table_name}'\n db.execute_sqls(delete_sql)\n\n#print the current time\ndef print_time():\n current_time = datetime.datetime.now()\n formatted_time = current_time.strftime(\"%Y-%m-%d %H:%M:%S\")\n print(formatted_time)\n\n\ndef missing_index(threads,duration,ncolumns,nrows,colsize,table_name='table1'):\n #create a new table\n print_time()\n db=Database(init())\n delete_table(table_name)\n create_table(table_name,colsize, ncolumns)\n\n # insert some data to be selected \n insert_definitions = ', '.join(f'(SELECT substr(md5(random()::text), 1, {colsize}))' for i in range(ncolumns))\n insert_data=f'insert into {table_name} select generate_series(1,{nrows}),{insert_definitions}, now();'\n db.execute_sqls(insert_data) \n\n #select without the index\n missing_index='select * from '+table_name+' where id='\n db.concurrent_execute_sql(threads,duration,missing_index,nrows)\n\n #delete the table\n delete_table(table_name)\n #print the end time\n print_time()\n\nif __name__ == \"__main__\":\n # Number of threads to use for concurrent inserts\n num_threads = 5\n \n # Duration for which to run the inserts (in seconds)\n insert_duration = None\n \n # Number of columns in the table\n num_columns = 5\n \n # Number of rows to insert\n num_rows = 2000000\n \n # Size of each column (in characters)\n column_size = 50\n \n # Table name\n table_name = 'table1'\n \n # Call the insert_large_data function\n missing_index(num_threads, insert_duration, num_columns, num_rows, column_size, table_name)\n", + "description": "In a data analysis scenario, there is a database with 5 columns and 2,000,000 rows. Each column has a size of 50 characters. However, there is a missing index in the database. This can result in an exception when 5 users search the database simultaneously.\n", + "desc": "In a business intelligence scenario, imagine there is a database named 'FinancialAnalyticsDB', which is designed to store and analyze financial data for a large organization. This database contains a table called 'FinancialTransactions', which records detailed information about various financial transactions, such as transaction ID, transaction type, amount, date, transaction status, and more. In this specific situation, the database is facing an issue of missing indexes. The database administrator needs to create indexes on the 'FinancialTransactions' table to improve the performance of queries involving transaction type, date, and other relevant columns. Currently, there are 2,000,000 rows of data in the table, with each row having 5 columns, each capable of storing up to 50 characters of information. The administrator plans to use 5 threads to create these indexes, aiming to optimize the database's ability to handle concurrent queries and improve overall search performance.In a randomly generated scenario, suppose we have a highly active e-commerce platform that utilizes a database for storing product information. This database, called 'ProductDB', includes a table named 'ProductDetails' that contains details about various products available for purchase. Within this table, there are 2,000,000 rows of data, each representing a unique product, with a total of 5 columns. These columns might include the product ID, name, price, stock quantity, and category. Each column can hold up to 50 characters of data. However, due to the lack of necessary indexes on commonly used search columns, such as name and category, the database struggles to efficiently handle concurrent search requests. 
As a result, users may experience slower search results or even failures when searching for products. Additionally, this lack of indexes could negatively impact the overall operational efficiency and reputation of the e-commerce platform.\n" + }, + "458": { + "start_time": "1697440784", + "end_time": "1697440854", + "start_timestamp": "2023-10-16 15:19:44", + "end_timestamp": "2023-10-16 15:20:54", + "alerts": [], + "labels": [ + "missing indexes" + ], + "command": "python anomaly_trigger/main.py --anomaly MISSING_INDEXES", + "script": "import psycopg2\nimport sys\nsys.path.append('/root/DB-GPT/')\nimport time\nimport datetime\nimport random\nimport yaml\nfrom multiprocessing.pool import *\n\n\nclass DBArgs(object):\n\n def __init__(self, dbtype, config, dbname=None):\n self.dbtype = dbtype\n if self.dbtype == 'mysql':\n self.host = config['host']\n self.port = config['port']\n self.user = config['user']\n self.password = config['password']\n self.dbname = dbname if dbname else config['dbname']\n self.driver = 'com.mysql.jdbc.Driver'\n self.jdbc = 'jdbc:mysql://'\n else:\n self.host = config['host']\n self.port = config['port']\n self.user = config['user']\n self.password = config['password']\n self.dbname = dbname if dbname else config['dbname']\n self.driver = 'org.postgresql.Driver'\n self.jdbc = 'jdbc:postgresql://'\n\nclass Database():\n def __init__(self, args, timeout=-1):\n self.args = args\n self.conn = self.resetConn(timeout)\n\n\n # self.schema = self.compute_table_schema()\n\n def resetConn(self, timeout=-1):\n conn = psycopg2.connect(database=self.args.dbname,\n user=self.args.user,\n password=self.args.password,\n host=self.args.host,\n port=self.args.port)\n return conn\n \n def execute_sqls(self,sql):\n self.conn =self.resetConn(timeout=-1)\n cur = self.conn.cursor()\n cur.execute(sql)\n self.conn.commit()\n cur.close()\n self.conn.close()\n\n def execute_sql_duration(self, duration, sql, max_id=0, commit_interval=500):\n self.conn = self.resetConn(timeout=-1)\n cursor = self.conn.cursor()\n start = time.time()\n cnt = 0\n if duration > 0:\n while (time.time() - start) < duration:\n if max_id > 0:\n id = random.randint(1, max_id - 1)\n cursor.execute(sql + str(id) + ';')\n else:\n cursor.execute(sql)\n cnt += 1\n if cnt % commit_interval == 0:\n self.conn.commit()\n else:\n print(\"error, the duration should be larger than 0\")\n self.conn.commit()\n cursor.close()\n self.conn.close()\n return cnt\n\n def concurrent_execute_sql(self, threads, duration, sql, max_id=0, commit_interval=500):\n pool = ThreadPool(threads)\n results = [pool.apply_async(self.execute_sql_duration, (duration, sql, max_id, commit_interval)) for _ in range(threads)]\n pool.close()\n pool.join()\n return results\n\ndef init():\n #add the config\n config_path = \"/root/DB-GPT/config/tool_config.yaml\"\n with open(config_path, 'r') as config_file:\n config = yaml.safe_load(config_file) \n db_args =DBArgs('pgsql', config)\n return db_args\n\n\n#create a table\ndef create_table(table_name,colsize, ncolumns):\n db=Database(init())\n column_definitions = ', '.join(f'name{i} varchar({colsize})' for i in range(ncolumns))\n creat_sql = f'CREATE TABLE {table_name} (id int, {column_definitions}, time timestamp);'\n db.execute_sqls(creat_sql)\n\n#delete the table\ndef delete_table(table_name):\n db=Database(init())\n delete_sql=f'DROP TABLE if exists {table_name}'\n db.execute_sqls(delete_sql)\n\n#print the current time\ndef print_time():\n current_time = datetime.datetime.now()\n formatted_time = 
current_time.strftime(\"%Y-%m-%d %H:%M:%S\")\n print(formatted_time)\n\n\ndef missing_index(threads,duration,ncolumns,nrows,colsize,table_name='table1'):\n #create a new table\n print_time()\n db=Database(init())\n delete_table(table_name)\n create_table(table_name,colsize, ncolumns)\n\n # insert some data to be selected \n insert_definitions = ', '.join(f'(SELECT substr(md5(random()::text), 1, {colsize}))' for i in range(ncolumns))\n insert_data=f'insert into {table_name} select generate_series(1,{nrows}),{insert_definitions}, now();'\n db.execute_sqls(insert_data) \n\n #select without the index\n missing_index='select * from '+table_name+' where id='\n db.concurrent_execute_sql(threads,duration,missing_index,nrows)\n\n #delete the table\n delete_table(table_name)\n #print the end time\n print_time()\n\nif __name__ == \"__main__\":\n # Number of threads to use for concurrent inserts\n num_threads = 5\n \n # Duration for which to run the inserts (in seconds)\n insert_duration = None\n \n # Number of columns in the table\n num_columns = 5\n \n # Number of rows to insert\n num_rows = 2000000\n \n # Size of each column (in characters)\n column_size = 50\n \n # Table name\n table_name = 'table1'\n \n # Call the insert_large_data function\n missing_index(num_threads, insert_duration, num_columns, num_rows, column_size, table_name)\n", + "description": "In a database for a social media platform, if there are 5 users searching in a database table containing 5 columns, 2,000,000 rows, each column size of 50 characters, but the search lacks the necessary index, resulting in an exception in the database.\n", + "desc": "In an e-commerce scenario where a database named 'OnlineStoreDB' is used by an online store, there is a table called 'ProductCatalog' that stores information about various products. This table contains 2,000,000 rows of data, each representing a product, and has a total of 5 columns with each column storing information of up to 50 characters. The columns in this table may include product ID, name, price, stock quantity, and category. However, the 'ProductCatalog' table does not have appropriate indexes, specifically on commonly used search columns such as product name and category. As a result, when users search for products using keywords or specific categories, the database cannot efficiently process these search requests. 
This inefficiency not only leads to a poor user experience with slower search results and potential failures but can also impact the overall performance and reputation of the online store.\n" + }, + "459": { + "start_time": "1697440915", + "end_time": "1697440985", + "start_timestamp": "2023-10-16 15:21:55", + "end_timestamp": "2023-10-16 15:23:05", + "alerts": [], + "labels": [ + "missing indexes" + ], + "command": "python anomaly_trigger/main.py --anomaly MISSING_INDEXES", + "script": "import psycopg2\nimport sys\nsys.path.append('/root/DB-GPT/')\nimport time\nimport datetime\nimport random\nimport yaml\nfrom multiprocessing.pool import *\n\n\nclass DBArgs(object):\n\n def __init__(self, dbtype, config, dbname=None):\n self.dbtype = dbtype\n if self.dbtype == 'mysql':\n self.host = config['host']\n self.port = config['port']\n self.user = config['user']\n self.password = config['password']\n self.dbname = dbname if dbname else config['dbname']\n self.driver = 'com.mysql.jdbc.Driver'\n self.jdbc = 'jdbc:mysql://'\n else:\n self.host = config['host']\n self.port = config['port']\n self.user = config['user']\n self.password = config['password']\n self.dbname = dbname if dbname else config['dbname']\n self.driver = 'org.postgresql.Driver'\n self.jdbc = 'jdbc:postgresql://'\n\nclass Database():\n def __init__(self, args, timeout=-1):\n self.args = args\n self.conn = self.resetConn(timeout)\n\n\n # self.schema = self.compute_table_schema()\n\n def resetConn(self, timeout=-1):\n conn = psycopg2.connect(database=self.args.dbname,\n user=self.args.user,\n password=self.args.password,\n host=self.args.host,\n port=self.args.port)\n return conn\n \n def execute_sqls(self,sql):\n self.conn =self.resetConn(timeout=-1)\n cur = self.conn.cursor()\n cur.execute(sql)\n self.conn.commit()\n cur.close()\n self.conn.close()\n\n def execute_sql_duration(self, duration, sql, max_id=0, commit_interval=500):\n self.conn = self.resetConn(timeout=-1)\n cursor = self.conn.cursor()\n start = time.time()\n cnt = 0\n if duration > 0:\n while (time.time() - start) < duration:\n if max_id > 0:\n id = random.randint(1, max_id - 1)\n cursor.execute(sql + str(id) + ';')\n else:\n cursor.execute(sql)\n cnt += 1\n if cnt % commit_interval == 0:\n self.conn.commit()\n else:\n print(\"error, the duration should be larger than 0\")\n self.conn.commit()\n cursor.close()\n self.conn.close()\n return cnt\n\n def concurrent_execute_sql(self, threads, duration, sql, max_id=0, commit_interval=500):\n pool = ThreadPool(threads)\n results = [pool.apply_async(self.execute_sql_duration, (duration, sql, max_id, commit_interval)) for _ in range(threads)]\n pool.close()\n pool.join()\n return results\n\ndef init():\n #add the config\n config_path = \"/root/DB-GPT/config/tool_config.yaml\"\n with open(config_path, 'r') as config_file:\n config = yaml.safe_load(config_file) \n db_args =DBArgs('pgsql', config)\n return db_args\n\n\n#create a table\ndef create_table(table_name,colsize, ncolumns):\n db=Database(init())\n column_definitions = ', '.join(f'name{i} varchar({colsize})' for i in range(ncolumns))\n creat_sql = f'CREATE TABLE {table_name} (id int, {column_definitions}, time timestamp);'\n db.execute_sqls(creat_sql)\n\n#delete the table\ndef delete_table(table_name):\n db=Database(init())\n delete_sql=f'DROP TABLE if exists {table_name}'\n db.execute_sqls(delete_sql)\n\n#print the current time\ndef print_time():\n current_time = datetime.datetime.now()\n formatted_time = current_time.strftime(\"%Y-%m-%d %H:%M:%S\")\n 
print(formatted_time)\n\n\ndef missing_index(threads,duration,ncolumns,nrows,colsize,table_name='table1'):\n #create a new table\n print_time()\n db=Database(init())\n delete_table(table_name)\n create_table(table_name,colsize, ncolumns)\n\n # insert some data to be selected \n insert_definitions = ', '.join(f'(SELECT substr(md5(random()::text), 1, {colsize}))' for i in range(ncolumns))\n insert_data=f'insert into {table_name} select generate_series(1,{nrows}),{insert_definitions}, now();'\n db.execute_sqls(insert_data) \n\n #select without the index\n missing_index='select * from '+table_name+' where id='\n db.concurrent_execute_sql(threads,duration,missing_index,nrows)\n\n #delete the table\n delete_table(table_name)\n #print the end time\n print_time()\n\nif __name__ == \"__main__\":\n # Number of threads to use for concurrent inserts\n num_threads = 10\n \n # Duration for which to run the inserts (in seconds)\n insert_duration = None\n \n # Number of columns in the table\n num_columns = 5\n \n # Number of rows to insert\n num_rows = 2000000\n \n # Size of each column (in characters)\n column_size = 50\n \n # Table name\n table_name = 'table1'\n \n # Call the insert_large_data function\n missing_index(num_threads, insert_duration, num_columns, num_rows, column_size, table_name)\n", + "description": "In a database used for a financial system, if there are 10 concurrent users searching in a table with 5 columns, 2,000,000 rows, each column containing 50 characters, and there is a lack of necessary indexes, an exception will occur in the database.\n", + "desc": "In the database of an e-commerce platform, there is a database used for managing product information called 'ProductDB'. This database stores data about various products offered by the platform. The main table in this database is called 'ProductDetails'. It contains 2,000,000 rows of data, each representing a product, and each row consists of 5 columns. These columns include product ID, name, price, stock quantity, and category. Each column can store up to 50 characters of information. However, due to the absence of necessary indexes on commonly searched columns, such as name and category, the database becomes inefficient in handling a large number of concurrent search queries. 
As a result, users may experience delays or failures in their search requests, negatively impacting their experience on the platform and potentially affecting the overall efficiency and reputation of the e-commerce platform.\n" + }, + "460": { + "start_time": "1697441046", + "end_time": "1697441117", + "start_timestamp": "2023-10-16 15:24:06", + "end_timestamp": "2023-10-16 15:25:17", + "alerts": [], + "labels": [ + "missing indexes" + ], + "command": "python anomaly_trigger/main.py --anomaly MISSING_INDEXES", + "script": "import psycopg2\nimport sys\nsys.path.append('/root/DB-GPT/')\nimport time\nimport datetime\nimport random\nimport yaml\nfrom multiprocessing.pool import *\n\n\nclass DBArgs(object):\n\n def __init__(self, dbtype, config, dbname=None):\n self.dbtype = dbtype\n if self.dbtype == 'mysql':\n self.host = config['host']\n self.port = config['port']\n self.user = config['user']\n self.password = config['password']\n self.dbname = dbname if dbname else config['dbname']\n self.driver = 'com.mysql.jdbc.Driver'\n self.jdbc = 'jdbc:mysql://'\n else:\n self.host = config['host']\n self.port = config['port']\n self.user = config['user']\n self.password = config['password']\n self.dbname = dbname if dbname else config['dbname']\n self.driver = 'org.postgresql.Driver'\n self.jdbc = 'jdbc:postgresql://'\n\nclass Database():\n def __init__(self, args, timeout=-1):\n self.args = args\n self.conn = self.resetConn(timeout)\n\n\n # self.schema = self.compute_table_schema()\n\n def resetConn(self, timeout=-1):\n conn = psycopg2.connect(database=self.args.dbname,\n user=self.args.user,\n password=self.args.password,\n host=self.args.host,\n port=self.args.port)\n return conn\n \n def execute_sqls(self,sql):\n self.conn =self.resetConn(timeout=-1)\n cur = self.conn.cursor()\n cur.execute(sql)\n self.conn.commit()\n cur.close()\n self.conn.close()\n\n def execute_sql_duration(self, duration, sql, max_id=0, commit_interval=500):\n self.conn = self.resetConn(timeout=-1)\n cursor = self.conn.cursor()\n start = time.time()\n cnt = 0\n if duration > 0:\n while (time.time() - start) < duration:\n if max_id > 0:\n id = random.randint(1, max_id - 1)\n cursor.execute(sql + str(id) + ';')\n else:\n cursor.execute(sql)\n cnt += 1\n if cnt % commit_interval == 0:\n self.conn.commit()\n else:\n print(\"error, the duration should be larger than 0\")\n self.conn.commit()\n cursor.close()\n self.conn.close()\n return cnt\n\n def concurrent_execute_sql(self, threads, duration, sql, max_id=0, commit_interval=500):\n pool = ThreadPool(threads)\n results = [pool.apply_async(self.execute_sql_duration, (duration, sql, max_id, commit_interval)) for _ in range(threads)]\n pool.close()\n pool.join()\n return results\n\ndef init():\n #add the config\n config_path = \"/root/DB-GPT/config/tool_config.yaml\"\n with open(config_path, 'r') as config_file:\n config = yaml.safe_load(config_file) \n db_args =DBArgs('pgsql', config)\n return db_args\n\n\n#create a table\ndef create_table(table_name,colsize, ncolumns):\n db=Database(init())\n column_definitions = ', '.join(f'name{i} varchar({colsize})' for i in range(ncolumns))\n creat_sql = f'CREATE TABLE {table_name} (id int, {column_definitions}, time timestamp);'\n db.execute_sqls(creat_sql)\n\n#delete the table\ndef delete_table(table_name):\n db=Database(init())\n delete_sql=f'DROP TABLE if exists {table_name}'\n db.execute_sqls(delete_sql)\n\n#print the current time\ndef print_time():\n current_time = datetime.datetime.now()\n formatted_time = 
current_time.strftime(\"%Y-%m-%d %H:%M:%S\")\n print(formatted_time)\n\n\ndef missing_index(threads,duration,ncolumns,nrows,colsize,table_name='table1'):\n #create a new table\n print_time()\n db=Database(init())\n delete_table(table_name)\n create_table(table_name,colsize, ncolumns)\n\n # insert some data to be selected \n insert_definitions = ', '.join(f'(SELECT substr(md5(random()::text), 1, {colsize}))' for i in range(ncolumns))\n insert_data=f'insert into {table_name} select generate_series(1,{nrows}),{insert_definitions}, now();'\n db.execute_sqls(insert_data) \n\n #select without the index\n missing_index='select * from '+table_name+' where id='\n db.concurrent_execute_sql(threads,duration,missing_index,nrows)\n\n #delete the table\n delete_table(table_name)\n #print the end time\n print_time()\n\nif __name__ == \"__main__\":\n # Number of threads to use for concurrent inserts\n num_threads = 10\n \n # Duration for which to run the inserts (in seconds)\n insert_duration = None\n \n # Number of columns in the table\n num_columns = 5\n \n # Number of rows to insert\n num_rows = 2000000\n \n # Size of each column (in characters)\n column_size = 50\n \n # Table name\n table_name = 'table1'\n \n # Call the insert_large_data function\n missing_index(num_threads, insert_duration, num_columns, num_rows, column_size, table_name)\n", + "description": "In a database for a social media platform, if there are 10 users simultaneously searching in a database table containing 5 columns, with each column size of 50 characters, and a total of 2,000,000 rows, but the necessary indexes for efficient searching are missing, it can result in a database exception.\n", + "desc": "In an e-commerce database, specifically a database for an online store named 'OnlineStoreDB', there is a table named 'ProductDetails' that contains detailed information about various products. This table holds 2,000,000 rows of data, with each row representing a unique product and consisting of 5 columns, each containing information of up to 50 characters. These columns may include product ID, name, price, stock quantity, and category. However, in this scenario, the table lacks necessary indexes, particularly on commonly used search columns such as name and category. As a result, when multiple users simultaneously search for products using the search interface, the database becomes inefficient in handling these concurrent search requests. This inefficiency negatively impacts user experience, leading to delays or failures in search results. 
Additionally, it can also affect the overall operational efficiency and reputation of the online store.\n" + }, + "461": { + "start_time": "1697441192", + "end_time": "1697441263", + "start_timestamp": "2023-10-16 15:26:32", + "end_timestamp": "2023-10-16 15:27:43", + "alerts": [], + "labels": [ + "missing indexes" + ], + "command": "python anomaly_trigger/main.py --anomaly MISSING_INDEXES", + "script": "import psycopg2\nimport sys\nsys.path.append('/root/DB-GPT/')\nimport time\nimport datetime\nimport random\nimport yaml\nfrom multiprocessing.pool import *\n\n\nclass DBArgs(object):\n\n def __init__(self, dbtype, config, dbname=None):\n self.dbtype = dbtype\n if self.dbtype == 'mysql':\n self.host = config['host']\n self.port = config['port']\n self.user = config['user']\n self.password = config['password']\n self.dbname = dbname if dbname else config['dbname']\n self.driver = 'com.mysql.jdbc.Driver'\n self.jdbc = 'jdbc:mysql://'\n else:\n self.host = config['host']\n self.port = config['port']\n self.user = config['user']\n self.password = config['password']\n self.dbname = dbname if dbname else config['dbname']\n self.driver = 'org.postgresql.Driver'\n self.jdbc = 'jdbc:postgresql://'\n\nclass Database():\n def __init__(self, args, timeout=-1):\n self.args = args\n self.conn = self.resetConn(timeout)\n\n\n # self.schema = self.compute_table_schema()\n\n def resetConn(self, timeout=-1):\n conn = psycopg2.connect(database=self.args.dbname,\n user=self.args.user,\n password=self.args.password,\n host=self.args.host,\n port=self.args.port)\n return conn\n \n def execute_sqls(self,sql):\n self.conn =self.resetConn(timeout=-1)\n cur = self.conn.cursor()\n cur.execute(sql)\n self.conn.commit()\n cur.close()\n self.conn.close()\n\n def execute_sql_duration(self, duration, sql, max_id=0, commit_interval=500):\n self.conn = self.resetConn(timeout=-1)\n cursor = self.conn.cursor()\n start = time.time()\n cnt = 0\n if duration > 0:\n while (time.time() - start) < duration:\n if max_id > 0:\n id = random.randint(1, max_id - 1)\n cursor.execute(sql + str(id) + ';')\n else:\n cursor.execute(sql)\n cnt += 1\n if cnt % commit_interval == 0:\n self.conn.commit()\n else:\n print(\"error, the duration should be larger than 0\")\n self.conn.commit()\n cursor.close()\n self.conn.close()\n return cnt\n\n def concurrent_execute_sql(self, threads, duration, sql, max_id=0, commit_interval=500):\n pool = ThreadPool(threads)\n results = [pool.apply_async(self.execute_sql_duration, (duration, sql, max_id, commit_interval)) for _ in range(threads)]\n pool.close()\n pool.join()\n return results\n\ndef init():\n #add the config\n config_path = \"/root/DB-GPT/config/tool_config.yaml\"\n with open(config_path, 'r') as config_file:\n config = yaml.safe_load(config_file) \n db_args =DBArgs('pgsql', config)\n return db_args\n\n\n#create a table\ndef create_table(table_name,colsize, ncolumns):\n db=Database(init())\n column_definitions = ', '.join(f'name{i} varchar({colsize})' for i in range(ncolumns))\n creat_sql = f'CREATE TABLE {table_name} (id int, {column_definitions}, time timestamp);'\n db.execute_sqls(creat_sql)\n\n#delete the table\ndef delete_table(table_name):\n db=Database(init())\n delete_sql=f'DROP TABLE if exists {table_name}'\n db.execute_sqls(delete_sql)\n\n#print the current time\ndef print_time():\n current_time = datetime.datetime.now()\n formatted_time = current_time.strftime(\"%Y-%m-%d %H:%M:%S\")\n print(formatted_time)\n\n\ndef 
missing_index(threads,duration,ncolumns,nrows,colsize,table_name='table1'):\n #create a new table\n print_time()\n db=Database(init())\n delete_table(table_name)\n create_table(table_name,colsize, ncolumns)\n\n # insert some data to be selected \n insert_definitions = ', '.join(f'(SELECT substr(md5(random()::text), 1, {colsize}))' for i in range(ncolumns))\n insert_data=f'insert into {table_name} select generate_series(1,{nrows}),{insert_definitions}, now();'\n db.execute_sqls(insert_data) \n\n #select without the index\n missing_index='select * from '+table_name+' where id='\n db.concurrent_execute_sql(threads,duration,missing_index,nrows)\n\n #delete the table\n delete_table(table_name)\n #print the end time\n print_time()\n\nif __name__ == \"__main__\":\n # Number of threads to use for concurrent inserts\n num_threads = 5\n \n # Duration for which to run the inserts (in seconds)\n insert_duration = None\n \n # Number of columns in the table\n num_columns = 5\n \n # Number of rows to insert\n num_rows = 4000000\n \n # Size of each column (in characters)\n column_size = 50\n \n # Table name\n table_name = 'table1'\n \n # Call the insert_large_data function\n missing_index(num_threads, insert_duration, num_columns, num_rows, column_size, table_name)\n", + "description": "In a database for a social media platform, if there are 5 users searching in a database table containing 5 columns, 4,000,000 rows, each column size of 50 characters, but the search lacks the necessary index, it can result in a database exception.\n", + "desc": "In the life scenario of an e-commerce platform, there is a database used for storing and managing product information. This database, named 'ECommerceDB', contains a table called 'ProductDetails' that stores detailed information about various products. In this scenario, the table 'ProductDetails' has a large number of rows, specifically 4,000,000 rows, with each row representing a single product. The table consists of 5 columns, each column containing information up to 50 characters in size. These columns might include attributes such as product ID, name, price, stock quantity, and category. However, in this specific case, there is a problem with missing indexes on the 'ProductDetails' table. This means that there are no indexes created on important columns, such as name, price, or category, which are frequently used for searching products. As a result, when users search for products on the e-commerce platform, the database becomes inefficient in handling these search requests, resulting in slow query performance and potentially even search failures. 
These inefficiencies can negatively impact the user experience, as well as the overall efficiency and reputation of the e-commerce platform.\n" + }, + "462": { + "start_time": "1697441323", + "end_time": "1697441394", + "start_timestamp": "2023-10-16 15:28:43", + "end_timestamp": "2023-10-16 15:29:54", + "alerts": [], + "labels": [ + "missing indexes" + ], + "command": "python anomaly_trigger/main.py --anomaly MISSING_INDEXES", + "script": "import psycopg2\nimport sys\nsys.path.append('/root/DB-GPT/')\nimport time\nimport datetime\nimport random\nimport yaml\nfrom multiprocessing.pool import *\n\n\nclass DBArgs(object):\n\n def __init__(self, dbtype, config, dbname=None):\n self.dbtype = dbtype\n if self.dbtype == 'mysql':\n self.host = config['host']\n self.port = config['port']\n self.user = config['user']\n self.password = config['password']\n self.dbname = dbname if dbname else config['dbname']\n self.driver = 'com.mysql.jdbc.Driver'\n self.jdbc = 'jdbc:mysql://'\n else:\n self.host = config['host']\n self.port = config['port']\n self.user = config['user']\n self.password = config['password']\n self.dbname = dbname if dbname else config['dbname']\n self.driver = 'org.postgresql.Driver'\n self.jdbc = 'jdbc:postgresql://'\n\nclass Database():\n def __init__(self, args, timeout=-1):\n self.args = args\n self.conn = self.resetConn(timeout)\n\n\n # self.schema = self.compute_table_schema()\n\n def resetConn(self, timeout=-1):\n conn = psycopg2.connect(database=self.args.dbname,\n user=self.args.user,\n password=self.args.password,\n host=self.args.host,\n port=self.args.port)\n return conn\n \n def execute_sqls(self,sql):\n self.conn =self.resetConn(timeout=-1)\n cur = self.conn.cursor()\n cur.execute(sql)\n self.conn.commit()\n cur.close()\n self.conn.close()\n\n def execute_sql_duration(self, duration, sql, max_id=0, commit_interval=500):\n self.conn = self.resetConn(timeout=-1)\n cursor = self.conn.cursor()\n start = time.time()\n cnt = 0\n if duration > 0:\n while (time.time() - start) < duration:\n if max_id > 0:\n id = random.randint(1, max_id - 1)\n cursor.execute(sql + str(id) + ';')\n else:\n cursor.execute(sql)\n cnt += 1\n if cnt % commit_interval == 0:\n self.conn.commit()\n else:\n print(\"error, the duration should be larger than 0\")\n self.conn.commit()\n cursor.close()\n self.conn.close()\n return cnt\n\n def concurrent_execute_sql(self, threads, duration, sql, max_id=0, commit_interval=500):\n pool = ThreadPool(threads)\n results = [pool.apply_async(self.execute_sql_duration, (duration, sql, max_id, commit_interval)) for _ in range(threads)]\n pool.close()\n pool.join()\n return results\n\ndef init():\n #add the config\n config_path = \"/root/DB-GPT/config/tool_config.yaml\"\n with open(config_path, 'r') as config_file:\n config = yaml.safe_load(config_file) \n db_args =DBArgs('pgsql', config)\n return db_args\n\n\n#create a table\ndef create_table(table_name,colsize, ncolumns):\n db=Database(init())\n column_definitions = ', '.join(f'name{i} varchar({colsize})' for i in range(ncolumns))\n creat_sql = f'CREATE TABLE {table_name} (id int, {column_definitions}, time timestamp);'\n db.execute_sqls(creat_sql)\n\n#delete the table\ndef delete_table(table_name):\n db=Database(init())\n delete_sql=f'DROP TABLE if exists {table_name}'\n db.execute_sqls(delete_sql)\n\n#print the current time\ndef print_time():\n current_time = datetime.datetime.now()\n formatted_time = current_time.strftime(\"%Y-%m-%d %H:%M:%S\")\n print(formatted_time)\n\n\ndef 
missing_index(threads,duration,ncolumns,nrows,colsize,table_name='table1'):\n #create a new table\n print_time()\n db=Database(init())\n delete_table(table_name)\n create_table(table_name,colsize, ncolumns)\n\n # insert some data to be selected \n insert_definitions = ', '.join(f'(SELECT substr(md5(random()::text), 1, {colsize}))' for i in range(ncolumns))\n insert_data=f'insert into {table_name} select generate_series(1,{nrows}),{insert_definitions}, now();'\n db.execute_sqls(insert_data) \n\n #select without the index\n missing_index='select * from '+table_name+' where id='\n db.concurrent_execute_sql(threads,duration,missing_index,nrows)\n\n #delete the table\n delete_table(table_name)\n #print the end time\n print_time()\n\nif __name__ == \"__main__\":\n # Number of threads to use for concurrent inserts\n num_threads = 5\n \n # Duration for which to run the inserts (in seconds)\n insert_duration = None\n \n # Number of columns in the table\n num_columns = 5\n \n # Number of rows to insert\n num_rows = 4000000\n \n # Size of each column (in characters)\n column_size = 50\n \n # Table name\n table_name = 'table1'\n \n # Call the insert_large_data function\n missing_index(num_threads, insert_duration, num_columns, num_rows, column_size, table_name)\n", + "description": "In a financial analysis system with 5 users, each user attempts to search through a database table containing 5 columns and 4 million rows of data. Each column has a size of 50 characters. However, the search operations are not optimized with the necessary indexes, causing a database exception.\n", + "desc": "In an e-commerce database called 'OnlineShopDB', there is a table named 'ProductDetails' that stores information about various products. This table contains 4,000,000 rows of data, each representing a product, with a total of 5 columns. These columns may include product ID, name, price, stock quantity, and category, each containing information of up to 50 characters. However, there is a lack of necessary indexes on commonly used search columns such as name and category. Because of this, the database becomes less efficient in handling search queries for products, especially when 5 users are concurrently searching for products. 
The lack of indexes may lead to delays or failures in search requests, affecting the overall user experience and operational efficiency of the e-commerce platform.\n" + }, + "463": { + "start_time": "1697441454", + "end_time": "1697441526", + "start_timestamp": "2023-10-16 15:30:54", + "end_timestamp": "2023-10-16 15:32:06", + "alerts": [], + "labels": [ + "missing indexes" + ], + "command": "python anomaly_trigger/main.py --anomaly MISSING_INDEXES", + "script": "import psycopg2\nimport sys\nsys.path.append('/root/DB-GPT/')\nimport time\nimport datetime\nimport random\nimport yaml\nfrom multiprocessing.pool import *\n\n\nclass DBArgs(object):\n\n def __init__(self, dbtype, config, dbname=None):\n self.dbtype = dbtype\n if self.dbtype == 'mysql':\n self.host = config['host']\n self.port = config['port']\n self.user = config['user']\n self.password = config['password']\n self.dbname = dbname if dbname else config['dbname']\n self.driver = 'com.mysql.jdbc.Driver'\n self.jdbc = 'jdbc:mysql://'\n else:\n self.host = config['host']\n self.port = config['port']\n self.user = config['user']\n self.password = config['password']\n self.dbname = dbname if dbname else config['dbname']\n self.driver = 'org.postgresql.Driver'\n self.jdbc = 'jdbc:postgresql://'\n\nclass Database():\n def __init__(self, args, timeout=-1):\n self.args = args\n self.conn = self.resetConn(timeout)\n\n\n # self.schema = self.compute_table_schema()\n\n def resetConn(self, timeout=-1):\n conn = psycopg2.connect(database=self.args.dbname,\n user=self.args.user,\n password=self.args.password,\n host=self.args.host,\n port=self.args.port)\n return conn\n \n def execute_sqls(self,sql):\n self.conn =self.resetConn(timeout=-1)\n cur = self.conn.cursor()\n cur.execute(sql)\n self.conn.commit()\n cur.close()\n self.conn.close()\n\n def execute_sql_duration(self, duration, sql, max_id=0, commit_interval=500):\n self.conn = self.resetConn(timeout=-1)\n cursor = self.conn.cursor()\n start = time.time()\n cnt = 0\n if duration > 0:\n while (time.time() - start) < duration:\n if max_id > 0:\n id = random.randint(1, max_id - 1)\n cursor.execute(sql + str(id) + ';')\n else:\n cursor.execute(sql)\n cnt += 1\n if cnt % commit_interval == 0:\n self.conn.commit()\n else:\n print(\"error, the duration should be larger than 0\")\n self.conn.commit()\n cursor.close()\n self.conn.close()\n return cnt\n\n def concurrent_execute_sql(self, threads, duration, sql, max_id=0, commit_interval=500):\n pool = ThreadPool(threads)\n results = [pool.apply_async(self.execute_sql_duration, (duration, sql, max_id, commit_interval)) for _ in range(threads)]\n pool.close()\n pool.join()\n return results\n\ndef init():\n #add the config\n config_path = \"/root/DB-GPT/config/tool_config.yaml\"\n with open(config_path, 'r') as config_file:\n config = yaml.safe_load(config_file) \n db_args =DBArgs('pgsql', config)\n return db_args\n\n\n#create a table\ndef create_table(table_name,colsize, ncolumns):\n db=Database(init())\n column_definitions = ', '.join(f'name{i} varchar({colsize})' for i in range(ncolumns))\n creat_sql = f'CREATE TABLE {table_name} (id int, {column_definitions}, time timestamp);'\n db.execute_sqls(creat_sql)\n\n#delete the table\ndef delete_table(table_name):\n db=Database(init())\n delete_sql=f'DROP TABLE if exists {table_name}'\n db.execute_sqls(delete_sql)\n\n#print the current time\ndef print_time():\n current_time = datetime.datetime.now()\n formatted_time = current_time.strftime(\"%Y-%m-%d %H:%M:%S\")\n print(formatted_time)\n\n\ndef 
missing_index(threads,duration,ncolumns,nrows,colsize,table_name='table1'):\n #create a new table\n print_time()\n db=Database(init())\n delete_table(table_name)\n create_table(table_name,colsize, ncolumns)\n\n # insert some data to be selected \n insert_definitions = ', '.join(f'(SELECT substr(md5(random()::text), 1, {colsize}))' for i in range(ncolumns))\n insert_data=f'insert into {table_name} select generate_series(1,{nrows}),{insert_definitions}, now();'\n db.execute_sqls(insert_data) \n\n #select without the index\n missing_index='select * from '+table_name+' where id='\n db.concurrent_execute_sql(threads,duration,missing_index,nrows)\n\n #delete the table\n delete_table(table_name)\n #print the end time\n print_time()\n\nif __name__ == \"__main__\":\n # Number of threads to use for concurrent inserts\n num_threads = 10\n \n # Duration for which to run the inserts (in seconds)\n insert_duration = None\n \n # Number of columns in the table\n num_columns = 5\n \n # Number of rows to insert\n num_rows = 4000000\n \n # Size of each column (in characters)\n column_size = 50\n \n # Table name\n table_name = 'table1'\n \n # Call the insert_large_data function\n missing_index(num_threads, insert_duration, num_columns, num_rows, column_size, table_name)\n", + "description": "In an online database used by a media streaming service, there are 10 users searching for data in a table with 5 columns and 4,000,000 rows. Each column has a size of 50 characters. However, there is no index present to optimize the search process, resulting in reduced performance and potential exceptions in the database.\n", + "desc": "In an e-commerce platform's database, there is a table named 'ProductDetails' designed to store information about various products. This table contains 4,000,000 rows of data, each representing a specific product. There are 5 columns in this table, including product ID, name, price, stock quantity, and category, with each column having a size of 50 characters. However, due to the lack of necessary indexes on commonly used search columns such as name, price, and category, the database's efficiency in handling concurrent search requests is compromised. 
This results in slower search queries and potentially affects the user experience and overall performance of the e-commerce platform.\n" + }, + "464": { + "start_time": "1697441586", + "end_time": "1697441658", + "start_timestamp": "2023-10-16 15:33:06", + "end_timestamp": "2023-10-16 15:34:18", + "alerts": [], + "labels": [ + "missing indexes" + ], + "command": "python anomaly_trigger/main.py --anomaly MISSING_INDEXES", + "script": "import psycopg2\nimport sys\nsys.path.append('/root/DB-GPT/')\nimport time\nimport datetime\nimport random\nimport yaml\nfrom multiprocessing.pool import *\n\n\nclass DBArgs(object):\n\n def __init__(self, dbtype, config, dbname=None):\n self.dbtype = dbtype\n if self.dbtype == 'mysql':\n self.host = config['host']\n self.port = config['port']\n self.user = config['user']\n self.password = config['password']\n self.dbname = dbname if dbname else config['dbname']\n self.driver = 'com.mysql.jdbc.Driver'\n self.jdbc = 'jdbc:mysql://'\n else:\n self.host = config['host']\n self.port = config['port']\n self.user = config['user']\n self.password = config['password']\n self.dbname = dbname if dbname else config['dbname']\n self.driver = 'org.postgresql.Driver'\n self.jdbc = 'jdbc:postgresql://'\n\nclass Database():\n def __init__(self, args, timeout=-1):\n self.args = args\n self.conn = self.resetConn(timeout)\n\n\n # self.schema = self.compute_table_schema()\n\n def resetConn(self, timeout=-1):\n conn = psycopg2.connect(database=self.args.dbname,\n user=self.args.user,\n password=self.args.password,\n host=self.args.host,\n port=self.args.port)\n return conn\n \n def execute_sqls(self,sql):\n self.conn =self.resetConn(timeout=-1)\n cur = self.conn.cursor()\n cur.execute(sql)\n self.conn.commit()\n cur.close()\n self.conn.close()\n\n def execute_sql_duration(self, duration, sql, max_id=0, commit_interval=500):\n self.conn = self.resetConn(timeout=-1)\n cursor = self.conn.cursor()\n start = time.time()\n cnt = 0\n if duration > 0:\n while (time.time() - start) < duration:\n if max_id > 0:\n id = random.randint(1, max_id - 1)\n cursor.execute(sql + str(id) + ';')\n else:\n cursor.execute(sql)\n cnt += 1\n if cnt % commit_interval == 0:\n self.conn.commit()\n else:\n print(\"error, the duration should be larger than 0\")\n self.conn.commit()\n cursor.close()\n self.conn.close()\n return cnt\n\n def concurrent_execute_sql(self, threads, duration, sql, max_id=0, commit_interval=500):\n pool = ThreadPool(threads)\n results = [pool.apply_async(self.execute_sql_duration, (duration, sql, max_id, commit_interval)) for _ in range(threads)]\n pool.close()\n pool.join()\n return results\n\ndef init():\n #add the config\n config_path = \"/root/DB-GPT/config/tool_config.yaml\"\n with open(config_path, 'r') as config_file:\n config = yaml.safe_load(config_file) \n db_args =DBArgs('pgsql', config)\n return db_args\n\n\n#create a table\ndef create_table(table_name,colsize, ncolumns):\n db=Database(init())\n column_definitions = ', '.join(f'name{i} varchar({colsize})' for i in range(ncolumns))\n creat_sql = f'CREATE TABLE {table_name} (id int, {column_definitions}, time timestamp);'\n db.execute_sqls(creat_sql)\n\n#delete the table\ndef delete_table(table_name):\n db=Database(init())\n delete_sql=f'DROP TABLE if exists {table_name}'\n db.execute_sqls(delete_sql)\n\n#print the current time\ndef print_time():\n current_time = datetime.datetime.now()\n formatted_time = current_time.strftime(\"%Y-%m-%d %H:%M:%S\")\n print(formatted_time)\n\n\ndef 
missing_index(threads,duration,ncolumns,nrows,colsize,table_name='table1'):\n #create a new table\n print_time()\n db=Database(init())\n delete_table(table_name)\n create_table(table_name,colsize, ncolumns)\n\n # insert some data to be selected \n insert_definitions = ', '.join(f'(SELECT substr(md5(random()::text), 1, {colsize}))' for i in range(ncolumns))\n insert_data=f'insert into {table_name} select generate_series(1,{nrows}),{insert_definitions}, now();'\n db.execute_sqls(insert_data) \n\n #select without the index\n missing_index='select * from '+table_name+' where id='\n db.concurrent_execute_sql(threads,duration,missing_index,nrows)\n\n #delete the table\n delete_table(table_name)\n #print the end time\n print_time()\n\nif __name__ == \"__main__\":\n # Number of threads to use for concurrent inserts\n num_threads = 10\n \n # Duration for which to run the inserts (in seconds)\n insert_duration = None\n \n # Number of columns in the table\n num_columns = 5\n \n # Number of rows to insert\n num_rows = 4000000\n \n # Size of each column (in characters)\n column_size = 50\n \n # Table name\n table_name = 'table1'\n \n # Call the insert_large_data function\n missing_index(num_threads, insert_duration, num_columns, num_rows, column_size, table_name)\n", + "description": "In a database used by a financial company, 10 users are simultaneously searching for data in a table with 5 columns and 4,000,000 rows. Each column has a size of 50 characters. However, the search is missing the necessary indexes, resulting in a potential exception in the database.\n", + "desc": "In an e-commerce platform database, called 'OnlineShopDB', there is a table named 'ProductList' that stores information about various products available for sale. This table contains 4,000,000 rows of data, each representing a different product, with a total of 5 columns. These columns include product ID, name, price, stock quantity, and category. Each column can store up to 50 characters. However, there is a lack of necessary indexes in the 'ProductList' table, particularly on commonly used search columns such as name and category. This absence of indexes causes the database to become inefficient when handling a large number of concurrent search requests. 
As a result, users may experience delays or failures when trying to search for products, and the overall operational efficiency and reputation of the e-commerce platform may be negatively affected.\n" + }, + "465": { + "start_time": "1697441731", + "end_time": "1697441802", + "start_timestamp": "2023-10-16 15:35:31", + "end_timestamp": "2023-10-16 15:36:42", + "alerts": [], + "labels": [ + "missing indexes" + ], + "command": "python anomaly_trigger/main.py --anomaly MISSING_INDEXES", + "script": "import psycopg2\nimport sys\nsys.path.append('/root/DB-GPT/')\nimport time\nimport datetime\nimport random\nimport yaml\nfrom multiprocessing.pool import *\n\n\nclass DBArgs(object):\n\n def __init__(self, dbtype, config, dbname=None):\n self.dbtype = dbtype\n if self.dbtype == 'mysql':\n self.host = config['host']\n self.port = config['port']\n self.user = config['user']\n self.password = config['password']\n self.dbname = dbname if dbname else config['dbname']\n self.driver = 'com.mysql.jdbc.Driver'\n self.jdbc = 'jdbc:mysql://'\n else:\n self.host = config['host']\n self.port = config['port']\n self.user = config['user']\n self.password = config['password']\n self.dbname = dbname if dbname else config['dbname']\n self.driver = 'org.postgresql.Driver'\n self.jdbc = 'jdbc:postgresql://'\n\nclass Database():\n def __init__(self, args, timeout=-1):\n self.args = args\n self.conn = self.resetConn(timeout)\n\n\n # self.schema = self.compute_table_schema()\n\n def resetConn(self, timeout=-1):\n conn = psycopg2.connect(database=self.args.dbname,\n user=self.args.user,\n password=self.args.password,\n host=self.args.host,\n port=self.args.port)\n return conn\n \n def execute_sqls(self,sql):\n self.conn =self.resetConn(timeout=-1)\n cur = self.conn.cursor()\n cur.execute(sql)\n self.conn.commit()\n cur.close()\n self.conn.close()\n\n def execute_sql_duration(self, duration, sql, max_id=0, commit_interval=500):\n self.conn = self.resetConn(timeout=-1)\n cursor = self.conn.cursor()\n start = time.time()\n cnt = 0\n if duration > 0:\n while (time.time() - start) < duration:\n if max_id > 0:\n id = random.randint(1, max_id - 1)\n cursor.execute(sql + str(id) + ';')\n else:\n cursor.execute(sql)\n cnt += 1\n if cnt % commit_interval == 0:\n self.conn.commit()\n else:\n print(\"error, the duration should be larger than 0\")\n self.conn.commit()\n cursor.close()\n self.conn.close()\n return cnt\n\n def concurrent_execute_sql(self, threads, duration, sql, max_id=0, commit_interval=500):\n pool = ThreadPool(threads)\n results = [pool.apply_async(self.execute_sql_duration, (duration, sql, max_id, commit_interval)) for _ in range(threads)]\n pool.close()\n pool.join()\n return results\n\ndef init():\n #add the config\n config_path = \"/root/DB-GPT/config/tool_config.yaml\"\n with open(config_path, 'r') as config_file:\n config = yaml.safe_load(config_file) \n db_args =DBArgs('pgsql', config)\n return db_args\n\n\n#create a table\ndef create_table(table_name,colsize, ncolumns):\n db=Database(init())\n column_definitions = ', '.join(f'name{i} varchar({colsize})' for i in range(ncolumns))\n creat_sql = f'CREATE TABLE {table_name} (id int, {column_definitions}, time timestamp);'\n db.execute_sqls(creat_sql)\n\n#delete the table\ndef delete_table(table_name):\n db=Database(init())\n delete_sql=f'DROP TABLE if exists {table_name}'\n db.execute_sqls(delete_sql)\n\n#print the current time\ndef print_time():\n current_time = datetime.datetime.now()\n formatted_time = current_time.strftime(\"%Y-%m-%d %H:%M:%S\")\n 
print(formatted_time)\n\n\ndef missing_index(threads,duration,ncolumns,nrows,colsize,table_name='table1'):\n #create a new table\n print_time()\n db=Database(init())\n delete_table(table_name)\n create_table(table_name,colsize, ncolumns)\n\n # insert some data to be selected \n insert_definitions = ', '.join(f'(SELECT substr(md5(random()::text), 1, {colsize}))' for i in range(ncolumns))\n insert_data=f'insert into {table_name} select generate_series(1,{nrows}),{insert_definitions}, now();'\n db.execute_sqls(insert_data) \n\n #select without the index\n missing_index='select * from '+table_name+' where id='\n db.concurrent_execute_sql(threads,duration,missing_index,nrows)\n\n #delete the table\n delete_table(table_name)\n #print the end time\n print_time()\n\nif __name__ == \"__main__\":\n # Number of threads to use for concurrent inserts\n num_threads = 5\n \n # Duration for which to run the inserts (in seconds)\n insert_duration = None\n \n # Number of columns in the table\n num_columns = 5\n \n # Number of rows to insert\n num_rows = 2000000\n \n # Size of each column (in characters)\n column_size = 100\n \n # Table name\n table_name = 'table1'\n \n # Call the insert_large_data function\n missing_index(num_threads, insert_duration, num_columns, num_rows, column_size, table_name)\n", + "description": "In a financial data analysis system, five analysts simultaneously search for information in a database table that contains five columns of data. Each column has a size of 100 characters, and there are 2,000,000 rows of data. However, the search operation does not utilize the necessary indexes, leading to a database exception.\n", + "desc": "In an e-commerce setting, there is a database specifically designed for storing information about different products. This database is referred to as 'ProductDB'. It contains a table called 'ProductDetails' that consists of 2,000,000 rows of data, with each row representing a unique product. The table has 5 columns, each of which can store up to 100 characters. These columns include product ID, name, price, stock quantity, and brand. However, the database suffers from a lack of necessary indexes, particularly on commonly used search columns such as name and brand. This deficiency in indexes affects the efficiency of search queries, especially when multiple users simultaneously search for products using the platform's search interface. As a result, the database experiences performance issues and anomalies, leading to delays or failures in search results. 
This negatively impacts user experience and the overall operational efficiency of the e-commerce platform.\n" + }, + "466": { + "start_time": "1697441862", + "end_time": "1697441933", + "start_timestamp": "2023-10-16 15:37:42", + "end_timestamp": "2023-10-16 15:38:53", + "alerts": [], + "labels": [ + "missing indexes" + ], + "command": "python anomaly_trigger/main.py --anomaly MISSING_INDEXES", + "script": "import psycopg2\nimport sys\nsys.path.append('/root/DB-GPT/')\nimport time\nimport datetime\nimport random\nimport yaml\nfrom multiprocessing.pool import *\n\n\nclass DBArgs(object):\n\n def __init__(self, dbtype, config, dbname=None):\n self.dbtype = dbtype\n if self.dbtype == 'mysql':\n self.host = config['host']\n self.port = config['port']\n self.user = config['user']\n self.password = config['password']\n self.dbname = dbname if dbname else config['dbname']\n self.driver = 'com.mysql.jdbc.Driver'\n self.jdbc = 'jdbc:mysql://'\n else:\n self.host = config['host']\n self.port = config['port']\n self.user = config['user']\n self.password = config['password']\n self.dbname = dbname if dbname else config['dbname']\n self.driver = 'org.postgresql.Driver'\n self.jdbc = 'jdbc:postgresql://'\n\nclass Database():\n def __init__(self, args, timeout=-1):\n self.args = args\n self.conn = self.resetConn(timeout)\n\n\n # self.schema = self.compute_table_schema()\n\n def resetConn(self, timeout=-1):\n conn = psycopg2.connect(database=self.args.dbname,\n user=self.args.user,\n password=self.args.password,\n host=self.args.host,\n port=self.args.port)\n return conn\n \n def execute_sqls(self,sql):\n self.conn =self.resetConn(timeout=-1)\n cur = self.conn.cursor()\n cur.execute(sql)\n self.conn.commit()\n cur.close()\n self.conn.close()\n\n def execute_sql_duration(self, duration, sql, max_id=0, commit_interval=500):\n self.conn = self.resetConn(timeout=-1)\n cursor = self.conn.cursor()\n start = time.time()\n cnt = 0\n if duration > 0:\n while (time.time() - start) < duration:\n if max_id > 0:\n id = random.randint(1, max_id - 1)\n cursor.execute(sql + str(id) + ';')\n else:\n cursor.execute(sql)\n cnt += 1\n if cnt % commit_interval == 0:\n self.conn.commit()\n else:\n print(\"error, the duration should be larger than 0\")\n self.conn.commit()\n cursor.close()\n self.conn.close()\n return cnt\n\n def concurrent_execute_sql(self, threads, duration, sql, max_id=0, commit_interval=500):\n pool = ThreadPool(threads)\n results = [pool.apply_async(self.execute_sql_duration, (duration, sql, max_id, commit_interval)) for _ in range(threads)]\n pool.close()\n pool.join()\n return results\n\ndef init():\n #add the config\n config_path = \"/root/DB-GPT/config/tool_config.yaml\"\n with open(config_path, 'r') as config_file:\n config = yaml.safe_load(config_file) \n db_args =DBArgs('pgsql', config)\n return db_args\n\n\n#create a table\ndef create_table(table_name,colsize, ncolumns):\n db=Database(init())\n column_definitions = ', '.join(f'name{i} varchar({colsize})' for i in range(ncolumns))\n creat_sql = f'CREATE TABLE {table_name} (id int, {column_definitions}, time timestamp);'\n db.execute_sqls(creat_sql)\n\n#delete the table\ndef delete_table(table_name):\n db=Database(init())\n delete_sql=f'DROP TABLE if exists {table_name}'\n db.execute_sqls(delete_sql)\n\n#print the current time\ndef print_time():\n current_time = datetime.datetime.now()\n formatted_time = current_time.strftime(\"%Y-%m-%d %H:%M:%S\")\n print(formatted_time)\n\n\ndef 
missing_index(threads,duration,ncolumns,nrows,colsize,table_name='table1'):\n #create a new table\n print_time()\n db=Database(init())\n delete_table(table_name)\n create_table(table_name,colsize, ncolumns)\n\n # insert some data to be selected \n insert_definitions = ', '.join(f'(SELECT substr(md5(random()::text), 1, {colsize}))' for i in range(ncolumns))\n insert_data=f'insert into {table_name} select generate_series(1,{nrows}),{insert_definitions}, now();'\n db.execute_sqls(insert_data) \n\n #select without the index\n missing_index='select * from '+table_name+' where id='\n db.concurrent_execute_sql(threads,duration,missing_index,nrows)\n\n #delete the table\n delete_table(table_name)\n #print the end time\n print_time()\n\nif __name__ == \"__main__\":\n # Number of threads to use for concurrent inserts\n num_threads = 5\n \n # Duration for which to run the inserts (in seconds)\n insert_duration = None\n \n # Number of columns in the table\n num_columns = 5\n \n # Number of rows to insert\n num_rows = 2000000\n \n # Size of each column (in characters)\n column_size = 100\n \n # Table name\n table_name = 'table1'\n \n # Call the insert_large_data function\n missing_index(num_threads, insert_duration, num_columns, num_rows, column_size, table_name)\n", + "description": "In the database of an online shopping platform, if there are 5 users simultaneously searching in the database table containing 5 columns, 2,000,000 rows, each column size 100 characters, but the search lacks the necessary index, resulting in an exception in the database.\n", + "desc": "In an e-commerce scenario, there is a database named 'ECommerceDB' that stores information about various products. Within this database, there is a table named 'ProductDetails' that contains 2,000,000 rows of data, each representing a product. This table has 5 columns including product ID, name, price, description, and product status, with each column having a size of 100 characters. Additionally, there are 5 users who are simultaneously performing search queries on the platform. However, due to the lack of necessary indexes on commonly used search columns, such as name and description, the database struggles to efficiently handle these concurrent search requests. 
This inefficiency can lead to anomalies in the database, resulting in negative impacts on user experience and the overall operation of the e-commerce platform.\n" + }, + "467": { + "start_time": "1697441993", + "end_time": "1697442064", + "start_timestamp": "2023-10-16 15:39:53", + "end_timestamp": "2023-10-16 15:41:04", + "alerts": [], + "labels": [ + "missing indexes" + ], + "command": "python anomaly_trigger/main.py --anomaly MISSING_INDEXES", + "script": "import psycopg2\nimport sys\nsys.path.append('/root/DB-GPT/')\nimport time\nimport datetime\nimport random\nimport yaml\nfrom multiprocessing.pool import *\n\n\nclass DBArgs(object):\n\n def __init__(self, dbtype, config, dbname=None):\n self.dbtype = dbtype\n if self.dbtype == 'mysql':\n self.host = config['host']\n self.port = config['port']\n self.user = config['user']\n self.password = config['password']\n self.dbname = dbname if dbname else config['dbname']\n self.driver = 'com.mysql.jdbc.Driver'\n self.jdbc = 'jdbc:mysql://'\n else:\n self.host = config['host']\n self.port = config['port']\n self.user = config['user']\n self.password = config['password']\n self.dbname = dbname if dbname else config['dbname']\n self.driver = 'org.postgresql.Driver'\n self.jdbc = 'jdbc:postgresql://'\n\nclass Database():\n def __init__(self, args, timeout=-1):\n self.args = args\n self.conn = self.resetConn(timeout)\n\n\n # self.schema = self.compute_table_schema()\n\n def resetConn(self, timeout=-1):\n conn = psycopg2.connect(database=self.args.dbname,\n user=self.args.user,\n password=self.args.password,\n host=self.args.host,\n port=self.args.port)\n return conn\n \n def execute_sqls(self,sql):\n self.conn =self.resetConn(timeout=-1)\n cur = self.conn.cursor()\n cur.execute(sql)\n self.conn.commit()\n cur.close()\n self.conn.close()\n\n def execute_sql_duration(self, duration, sql, max_id=0, commit_interval=500):\n self.conn = self.resetConn(timeout=-1)\n cursor = self.conn.cursor()\n start = time.time()\n cnt = 0\n if duration > 0:\n while (time.time() - start) < duration:\n if max_id > 0:\n id = random.randint(1, max_id - 1)\n cursor.execute(sql + str(id) + ';')\n else:\n cursor.execute(sql)\n cnt += 1\n if cnt % commit_interval == 0:\n self.conn.commit()\n else:\n print(\"error, the duration should be larger than 0\")\n self.conn.commit()\n cursor.close()\n self.conn.close()\n return cnt\n\n def concurrent_execute_sql(self, threads, duration, sql, max_id=0, commit_interval=500):\n pool = ThreadPool(threads)\n results = [pool.apply_async(self.execute_sql_duration, (duration, sql, max_id, commit_interval)) for _ in range(threads)]\n pool.close()\n pool.join()\n return results\n\ndef init():\n #add the config\n config_path = \"/root/DB-GPT/config/tool_config.yaml\"\n with open(config_path, 'r') as config_file:\n config = yaml.safe_load(config_file) \n db_args =DBArgs('pgsql', config)\n return db_args\n\n\n#create a table\ndef create_table(table_name,colsize, ncolumns):\n db=Database(init())\n column_definitions = ', '.join(f'name{i} varchar({colsize})' for i in range(ncolumns))\n creat_sql = f'CREATE TABLE {table_name} (id int, {column_definitions}, time timestamp);'\n db.execute_sqls(creat_sql)\n\n#delete the table\ndef delete_table(table_name):\n db=Database(init())\n delete_sql=f'DROP TABLE if exists {table_name}'\n db.execute_sqls(delete_sql)\n\n#print the current time\ndef print_time():\n current_time = datetime.datetime.now()\n formatted_time = current_time.strftime(\"%Y-%m-%d %H:%M:%S\")\n print(formatted_time)\n\n\ndef 
missing_index(threads,duration,ncolumns,nrows,colsize,table_name='table1'):\n #create a new table\n print_time()\n db=Database(init())\n delete_table(table_name)\n create_table(table_name,colsize, ncolumns)\n\n # insert some data to be selected \n insert_definitions = ', '.join(f'(SELECT substr(md5(random()::text), 1, {colsize}))' for i in range(ncolumns))\n insert_data=f'insert into {table_name} select generate_series(1,{nrows}),{insert_definitions}, now();'\n db.execute_sqls(insert_data) \n\n #select without the index\n missing_index='select * from '+table_name+' where id='\n db.concurrent_execute_sql(threads,duration,missing_index,nrows)\n\n #delete the table\n delete_table(table_name)\n #print the end time\n print_time()\n\nif __name__ == \"__main__\":\n # Number of threads to use for concurrent inserts\n num_threads = 10\n \n # Duration for which to run the inserts (in seconds)\n insert_duration = None\n \n # Number of columns in the table\n num_columns = 5\n \n # Number of rows to insert\n num_rows = 2000000\n \n # Size of each column (in characters)\n column_size = 100\n \n # Table name\n table_name = 'table1'\n \n # Call the insert_large_data function\n missing_index(num_threads, insert_duration, num_columns, num_rows, column_size, table_name)\n", + "description": "In a database used for a social media platform, if there are 10 users searching in a database table with 5 columns, 2,000,000 rows, each column consisting of 100 characters, but the search lacks the necessary index, it could lead to performance issues and a slower search process.\n", + "desc": "In an e-commerce platform, there is a database used for storing information about various products. This database contains a table named 'ProductDetails', which has 2,000,000 rows of data, each representing a product. The table has 5 columns, each with a size of 100 characters. These columns might include product ID, name, price, stock quantity, and brand. However, there is an issue with missing indexes in this table, particularly on commonly used columns such as name, brand, and price. This lack of indexes causes inefficiency when handling a large number of concurrent search requests. 
As a result, the database experiences anomalies, such as delays or failures in search queries, which negatively impact user experience and the overall operation of the e-commerce platform.\n" + }, + "468": { + "start_time": "1697442124", + "end_time": "1697442195", + "start_timestamp": "2023-10-16 15:42:04", + "end_timestamp": "2023-10-16 15:43:15", + "alerts": [], + "labels": [ + "missing indexes" + ], + "command": "python anomaly_trigger/main.py --anomaly MISSING_INDEXES", + "script": "import psycopg2\nimport sys\nsys.path.append('/root/DB-GPT/')\nimport time\nimport datetime\nimport random\nimport yaml\nfrom multiprocessing.pool import *\n\n\nclass DBArgs(object):\n\n def __init__(self, dbtype, config, dbname=None):\n self.dbtype = dbtype\n if self.dbtype == 'mysql':\n self.host = config['host']\n self.port = config['port']\n self.user = config['user']\n self.password = config['password']\n self.dbname = dbname if dbname else config['dbname']\n self.driver = 'com.mysql.jdbc.Driver'\n self.jdbc = 'jdbc:mysql://'\n else:\n self.host = config['host']\n self.port = config['port']\n self.user = config['user']\n self.password = config['password']\n self.dbname = dbname if dbname else config['dbname']\n self.driver = 'org.postgresql.Driver'\n self.jdbc = 'jdbc:postgresql://'\n\nclass Database():\n def __init__(self, args, timeout=-1):\n self.args = args\n self.conn = self.resetConn(timeout)\n\n\n # self.schema = self.compute_table_schema()\n\n def resetConn(self, timeout=-1):\n conn = psycopg2.connect(database=self.args.dbname,\n user=self.args.user,\n password=self.args.password,\n host=self.args.host,\n port=self.args.port)\n return conn\n \n def execute_sqls(self,sql):\n self.conn =self.resetConn(timeout=-1)\n cur = self.conn.cursor()\n cur.execute(sql)\n self.conn.commit()\n cur.close()\n self.conn.close()\n\n def execute_sql_duration(self, duration, sql, max_id=0, commit_interval=500):\n self.conn = self.resetConn(timeout=-1)\n cursor = self.conn.cursor()\n start = time.time()\n cnt = 0\n if duration > 0:\n while (time.time() - start) < duration:\n if max_id > 0:\n id = random.randint(1, max_id - 1)\n cursor.execute(sql + str(id) + ';')\n else:\n cursor.execute(sql)\n cnt += 1\n if cnt % commit_interval == 0:\n self.conn.commit()\n else:\n print(\"error, the duration should be larger than 0\")\n self.conn.commit()\n cursor.close()\n self.conn.close()\n return cnt\n\n def concurrent_execute_sql(self, threads, duration, sql, max_id=0, commit_interval=500):\n pool = ThreadPool(threads)\n results = [pool.apply_async(self.execute_sql_duration, (duration, sql, max_id, commit_interval)) for _ in range(threads)]\n pool.close()\n pool.join()\n return results\n\ndef init():\n #add the config\n config_path = \"/root/DB-GPT/config/tool_config.yaml\"\n with open(config_path, 'r') as config_file:\n config = yaml.safe_load(config_file) \n db_args =DBArgs('pgsql', config)\n return db_args\n\n\n#create a table\ndef create_table(table_name,colsize, ncolumns):\n db=Database(init())\n column_definitions = ', '.join(f'name{i} varchar({colsize})' for i in range(ncolumns))\n creat_sql = f'CREATE TABLE {table_name} (id int, {column_definitions}, time timestamp);'\n db.execute_sqls(creat_sql)\n\n#delete the table\ndef delete_table(table_name):\n db=Database(init())\n delete_sql=f'DROP TABLE if exists {table_name}'\n db.execute_sqls(delete_sql)\n\n#print the current time\ndef print_time():\n current_time = datetime.datetime.now()\n formatted_time = current_time.strftime(\"%Y-%m-%d %H:%M:%S\")\n 
print(formatted_time)\n\n\ndef missing_index(threads,duration,ncolumns,nrows,colsize,table_name='table1'):\n #create a new table\n print_time()\n db=Database(init())\n delete_table(table_name)\n create_table(table_name,colsize, ncolumns)\n\n # insert some data to be selected \n insert_definitions = ', '.join(f'(SELECT substr(md5(random()::text), 1, {colsize}))' for i in range(ncolumns))\n insert_data=f'insert into {table_name} select generate_series(1,{nrows}),{insert_definitions}, now();'\n db.execute_sqls(insert_data) \n\n #select without the index\n missing_index='select * from '+table_name+' where id='\n db.concurrent_execute_sql(threads,duration,missing_index,nrows)\n\n #delete the table\n delete_table(table_name)\n #print the end time\n print_time()\n\nif __name__ == \"__main__\":\n # Number of threads to use for concurrent inserts\n num_threads = 10\n \n # Duration for which to run the inserts (in seconds)\n insert_duration = None\n \n # Number of columns in the table\n num_columns = 5\n \n # Number of rows to insert\n num_rows = 2000000\n \n # Size of each column (in characters)\n column_size = 100\n \n # Table name\n table_name = 'table1'\n \n # Call the insert_large_data function\n missing_index(num_threads, insert_duration, num_columns, num_rows, column_size, table_name)\n", + "description": "In an online shopping platform, if there are 10 users searching in a database table containing 5 columns, 2,000,000 rows, each column size of 100 characters, but the search lacks the necessary index, resulting in an exception in the database.\n", + "desc": "In the e-commerce scenario, there is a database used for storing information about various products. This database contains a table called 'ProductDetails', which has 2,000,000 rows of data representing different products. Each row in this table has 5 columns, including product ID, name, price, stock quantity, and description, with each column having a size of 100 characters. However, there are no necessary indexes in the table, especially on commonly queried columns like name or price. 
This lack of indexes can lead to inefficiencies when searching for products, resulting in slower search speeds and potentially impacting the user experience and operational efficiency of the e-commerce platform.\n" + }, + "469": { + "start_time": "1697442281", + "end_time": "1697442352", + "start_timestamp": "2023-10-16 15:44:41", + "end_timestamp": "2023-10-16 15:45:52", + "alerts": [], + "labels": [ + "missing indexes" + ], + "command": "python anomaly_trigger/main.py --anomaly MISSING_INDEXES", + "script": "import psycopg2\nimport sys\nsys.path.append('/root/DB-GPT/')\nimport time\nimport datetime\nimport random\nimport yaml\nfrom multiprocessing.pool import *\n\n\nclass DBArgs(object):\n\n def __init__(self, dbtype, config, dbname=None):\n self.dbtype = dbtype\n if self.dbtype == 'mysql':\n self.host = config['host']\n self.port = config['port']\n self.user = config['user']\n self.password = config['password']\n self.dbname = dbname if dbname else config['dbname']\n self.driver = 'com.mysql.jdbc.Driver'\n self.jdbc = 'jdbc:mysql://'\n else:\n self.host = config['host']\n self.port = config['port']\n self.user = config['user']\n self.password = config['password']\n self.dbname = dbname if dbname else config['dbname']\n self.driver = 'org.postgresql.Driver'\n self.jdbc = 'jdbc:postgresql://'\n\nclass Database():\n def __init__(self, args, timeout=-1):\n self.args = args\n self.conn = self.resetConn(timeout)\n\n\n # self.schema = self.compute_table_schema()\n\n def resetConn(self, timeout=-1):\n conn = psycopg2.connect(database=self.args.dbname,\n user=self.args.user,\n password=self.args.password,\n host=self.args.host,\n port=self.args.port)\n return conn\n \n def execute_sqls(self,sql):\n self.conn =self.resetConn(timeout=-1)\n cur = self.conn.cursor()\n cur.execute(sql)\n self.conn.commit()\n cur.close()\n self.conn.close()\n\n def execute_sql_duration(self, duration, sql, max_id=0, commit_interval=500):\n self.conn = self.resetConn(timeout=-1)\n cursor = self.conn.cursor()\n start = time.time()\n cnt = 0\n if duration > 0:\n while (time.time() - start) < duration:\n if max_id > 0:\n id = random.randint(1, max_id - 1)\n cursor.execute(sql + str(id) + ';')\n else:\n cursor.execute(sql)\n cnt += 1\n if cnt % commit_interval == 0:\n self.conn.commit()\n else:\n print(\"error, the duration should be larger than 0\")\n self.conn.commit()\n cursor.close()\n self.conn.close()\n return cnt\n\n def concurrent_execute_sql(self, threads, duration, sql, max_id=0, commit_interval=500):\n pool = ThreadPool(threads)\n results = [pool.apply_async(self.execute_sql_duration, (duration, sql, max_id, commit_interval)) for _ in range(threads)]\n pool.close()\n pool.join()\n return results\n\ndef init():\n #add the config\n config_path = \"/root/DB-GPT/config/tool_config.yaml\"\n with open(config_path, 'r') as config_file:\n config = yaml.safe_load(config_file) \n db_args =DBArgs('pgsql', config)\n return db_args\n\n\n#create a table\ndef create_table(table_name,colsize, ncolumns):\n db=Database(init())\n column_definitions = ', '.join(f'name{i} varchar({colsize})' for i in range(ncolumns))\n creat_sql = f'CREATE TABLE {table_name} (id int, {column_definitions}, time timestamp);'\n db.execute_sqls(creat_sql)\n\n#delete the table\ndef delete_table(table_name):\n db=Database(init())\n delete_sql=f'DROP TABLE if exists {table_name}'\n db.execute_sqls(delete_sql)\n\n#print the current time\ndef print_time():\n current_time = datetime.datetime.now()\n formatted_time = current_time.strftime(\"%Y-%m-%d 
%H:%M:%S\")\n print(formatted_time)\n\n\ndef missing_index(threads,duration,ncolumns,nrows,colsize,table_name='table1'):\n #create a new table\n print_time()\n db=Database(init())\n delete_table(table_name)\n create_table(table_name,colsize, ncolumns)\n\n # insert some data to be selected \n insert_definitions = ', '.join(f'(SELECT substr(md5(random()::text), 1, {colsize}))' for i in range(ncolumns))\n insert_data=f'insert into {table_name} select generate_series(1,{nrows}),{insert_definitions}, now();'\n db.execute_sqls(insert_data) \n\n #select without the index\n missing_index='select * from '+table_name+' where id='\n db.concurrent_execute_sql(threads,duration,missing_index,nrows)\n\n #delete the table\n delete_table(table_name)\n #print the end time\n print_time()\n\nif __name__ == \"__main__\":\n # Number of threads to use for concurrent inserts\n num_threads = 5\n \n # Duration for which to run the inserts (in seconds)\n insert_duration = None\n \n # Number of columns in the table\n num_columns = 5\n \n # Number of rows to insert\n num_rows = 4000000\n \n # Size of each column (in characters)\n column_size = 100\n \n # Table name\n table_name = 'table1'\n \n # Call the insert_large_data function\n missing_index(num_threads, insert_duration, num_columns, num_rows, column_size, table_name)\n", + "description": "In a database used by an online platform, if there are 5 users searching in a database table containing 5 columns, 4,000,000 rows, each column with a size of 100 characters, but the search lacks the necessary index, it may cause an exception in the database.\n", + "desc": "In an e-commerce platform, there is a database called 'ECommerceDB' that stores information about various products. This database contains a key table named 'ProductDetails' that has a total of 4,000,000 rows of data, each representing a product. The table consists of 5 columns, each with a size of 100 characters. These columns might include product ID, name, price, stock quantity, and category. However, there is a lack of necessary indexes on commonly used search columns such as name and category. As a result, when 5 users simultaneously search for products through the platform, the database becomes inefficient in handling these concurrent search requests. 
This inefficiency can lead to anomalies in the database, causing delays or failures in search operations and impacting the overall user experience and operational efficiency of the e-commerce platform.\n" + }, + "470": { + "start_time": "1697442413", + "end_time": "1697442484", + "start_timestamp": "2023-10-16 15:46:53", + "end_timestamp": "2023-10-16 15:48:04", + "alerts": [], + "labels": [ + "missing indexes" + ], + "command": "python anomaly_trigger/main.py --anomaly MISSING_INDEXES", + "script": "import psycopg2\nimport sys\nsys.path.append('/root/DB-GPT/')\nimport time\nimport datetime\nimport random\nimport yaml\nfrom multiprocessing.pool import *\n\n\nclass DBArgs(object):\n\n def __init__(self, dbtype, config, dbname=None):\n self.dbtype = dbtype\n if self.dbtype == 'mysql':\n self.host = config['host']\n self.port = config['port']\n self.user = config['user']\n self.password = config['password']\n self.dbname = dbname if dbname else config['dbname']\n self.driver = 'com.mysql.jdbc.Driver'\n self.jdbc = 'jdbc:mysql://'\n else:\n self.host = config['host']\n self.port = config['port']\n self.user = config['user']\n self.password = config['password']\n self.dbname = dbname if dbname else config['dbname']\n self.driver = 'org.postgresql.Driver'\n self.jdbc = 'jdbc:postgresql://'\n\nclass Database():\n def __init__(self, args, timeout=-1):\n self.args = args\n self.conn = self.resetConn(timeout)\n\n\n # self.schema = self.compute_table_schema()\n\n def resetConn(self, timeout=-1):\n conn = psycopg2.connect(database=self.args.dbname,\n user=self.args.user,\n password=self.args.password,\n host=self.args.host,\n port=self.args.port)\n return conn\n \n def execute_sqls(self,sql):\n self.conn =self.resetConn(timeout=-1)\n cur = self.conn.cursor()\n cur.execute(sql)\n self.conn.commit()\n cur.close()\n self.conn.close()\n\n def execute_sql_duration(self, duration, sql, max_id=0, commit_interval=500):\n self.conn = self.resetConn(timeout=-1)\n cursor = self.conn.cursor()\n start = time.time()\n cnt = 0\n if duration > 0:\n while (time.time() - start) < duration:\n if max_id > 0:\n id = random.randint(1, max_id - 1)\n cursor.execute(sql + str(id) + ';')\n else:\n cursor.execute(sql)\n cnt += 1\n if cnt % commit_interval == 0:\n self.conn.commit()\n else:\n print(\"error, the duration should be larger than 0\")\n self.conn.commit()\n cursor.close()\n self.conn.close()\n return cnt\n\n def concurrent_execute_sql(self, threads, duration, sql, max_id=0, commit_interval=500):\n pool = ThreadPool(threads)\n results = [pool.apply_async(self.execute_sql_duration, (duration, sql, max_id, commit_interval)) for _ in range(threads)]\n pool.close()\n pool.join()\n return results\n\ndef init():\n #add the config\n config_path = \"/root/DB-GPT/config/tool_config.yaml\"\n with open(config_path, 'r') as config_file:\n config = yaml.safe_load(config_file) \n db_args =DBArgs('pgsql', config)\n return db_args\n\n\n#create a table\ndef create_table(table_name,colsize, ncolumns):\n db=Database(init())\n column_definitions = ', '.join(f'name{i} varchar({colsize})' for i in range(ncolumns))\n creat_sql = f'CREATE TABLE {table_name} (id int, {column_definitions}, time timestamp);'\n db.execute_sqls(creat_sql)\n\n#delete the table\ndef delete_table(table_name):\n db=Database(init())\n delete_sql=f'DROP TABLE if exists {table_name}'\n db.execute_sqls(delete_sql)\n\n#print the current time\ndef print_time():\n current_time = datetime.datetime.now()\n formatted_time = current_time.strftime(\"%Y-%m-%d %H:%M:%S\")\n 
print(formatted_time)\n\n\ndef missing_index(threads,duration,ncolumns,nrows,colsize,table_name='table1'):\n #create a new table\n print_time()\n db=Database(init())\n delete_table(table_name)\n create_table(table_name,colsize, ncolumns)\n\n # insert some data to be selected \n insert_definitions = ', '.join(f'(SELECT substr(md5(random()::text), 1, {colsize}))' for i in range(ncolumns))\n insert_data=f'insert into {table_name} select generate_series(1,{nrows}),{insert_definitions}, now();'\n db.execute_sqls(insert_data) \n\n #select without the index\n missing_index='select * from '+table_name+' where id='\n db.concurrent_execute_sql(threads,duration,missing_index,nrows)\n\n #delete the table\n delete_table(table_name)\n #print the end time\n print_time()\n\nif __name__ == \"__main__\":\n # Number of threads to use for concurrent inserts\n num_threads = 5\n \n # Duration for which to run the inserts (in seconds)\n insert_duration = None\n \n # Number of columns in the table\n num_columns = 5\n \n # Number of rows to insert\n num_rows = 4000000\n \n # Size of each column (in characters)\n column_size = 100\n \n # Table name\n table_name = 'table1'\n \n # Call the insert_large_data function\n missing_index(num_threads, insert_duration, num_columns, num_rows, column_size, table_name)\n", + "description": "In a database for an online store, if there are 5 users searching in a table containing 5 columns, 4,000,000 rows, with each column size of 100 characters, but the search lacks the necessary index, it can result in an exception in the database.\n", + "desc": "In the e-commerce database of an online store, there is a table named 'ProductDetails' that stores information about various products. This table contains 4 million rows, each representing a product, with 5 columns. These columns include product ID, name, price, stock quantity, and brand, with each column having a size of 100 characters. However, due to the lack of necessary indexes on commonly used search columns, such as name and brand, the database's performance is affected when handling search queries. This means that when users search for products by name or brand, the database takes a long time to process those queries due to the missing indexes. 
This can lead to a decline in user experience and adversely affect the operational efficiency and reputation of the online store.\n" + }, + "471": { + "start_time": "1697442544", + "end_time": "1697442616", + "start_timestamp": "2023-10-16 15:49:04", + "end_timestamp": "2023-10-16 15:50:16", + "alerts": [], + "labels": [ + "missing indexes" + ], + "command": "python anomaly_trigger/main.py --anomaly MISSING_INDEXES", + "script": "import psycopg2\nimport sys\nsys.path.append('/root/DB-GPT/')\nimport time\nimport datetime\nimport random\nimport yaml\nfrom multiprocessing.pool import *\n\n\nclass DBArgs(object):\n\n def __init__(self, dbtype, config, dbname=None):\n self.dbtype = dbtype\n if self.dbtype == 'mysql':\n self.host = config['host']\n self.port = config['port']\n self.user = config['user']\n self.password = config['password']\n self.dbname = dbname if dbname else config['dbname']\n self.driver = 'com.mysql.jdbc.Driver'\n self.jdbc = 'jdbc:mysql://'\n else:\n self.host = config['host']\n self.port = config['port']\n self.user = config['user']\n self.password = config['password']\n self.dbname = dbname if dbname else config['dbname']\n self.driver = 'org.postgresql.Driver'\n self.jdbc = 'jdbc:postgresql://'\n\nclass Database():\n def __init__(self, args, timeout=-1):\n self.args = args\n self.conn = self.resetConn(timeout)\n\n\n # self.schema = self.compute_table_schema()\n\n def resetConn(self, timeout=-1):\n conn = psycopg2.connect(database=self.args.dbname,\n user=self.args.user,\n password=self.args.password,\n host=self.args.host,\n port=self.args.port)\n return conn\n \n def execute_sqls(self,sql):\n self.conn =self.resetConn(timeout=-1)\n cur = self.conn.cursor()\n cur.execute(sql)\n self.conn.commit()\n cur.close()\n self.conn.close()\n\n def execute_sql_duration(self, duration, sql, max_id=0, commit_interval=500):\n self.conn = self.resetConn(timeout=-1)\n cursor = self.conn.cursor()\n start = time.time()\n cnt = 0\n if duration > 0:\n while (time.time() - start) < duration:\n if max_id > 0:\n id = random.randint(1, max_id - 1)\n cursor.execute(sql + str(id) + ';')\n else:\n cursor.execute(sql)\n cnt += 1\n if cnt % commit_interval == 0:\n self.conn.commit()\n else:\n print(\"error, the duration should be larger than 0\")\n self.conn.commit()\n cursor.close()\n self.conn.close()\n return cnt\n\n def concurrent_execute_sql(self, threads, duration, sql, max_id=0, commit_interval=500):\n pool = ThreadPool(threads)\n results = [pool.apply_async(self.execute_sql_duration, (duration, sql, max_id, commit_interval)) for _ in range(threads)]\n pool.close()\n pool.join()\n return results\n\ndef init():\n #add the config\n config_path = \"/root/DB-GPT/config/tool_config.yaml\"\n with open(config_path, 'r') as config_file:\n config = yaml.safe_load(config_file) \n db_args =DBArgs('pgsql', config)\n return db_args\n\n\n#create a table\ndef create_table(table_name,colsize, ncolumns):\n db=Database(init())\n column_definitions = ', '.join(f'name{i} varchar({colsize})' for i in range(ncolumns))\n creat_sql = f'CREATE TABLE {table_name} (id int, {column_definitions}, time timestamp);'\n db.execute_sqls(creat_sql)\n\n#delete the table\ndef delete_table(table_name):\n db=Database(init())\n delete_sql=f'DROP TABLE if exists {table_name}'\n db.execute_sqls(delete_sql)\n\n#print the current time\ndef print_time():\n current_time = datetime.datetime.now()\n formatted_time = current_time.strftime(\"%Y-%m-%d %H:%M:%S\")\n print(formatted_time)\n\n\ndef 
missing_index(threads,duration,ncolumns,nrows,colsize,table_name='table1'):\n #create a new table\n print_time()\n db=Database(init())\n delete_table(table_name)\n create_table(table_name,colsize, ncolumns)\n\n # insert some data to be selected \n insert_definitions = ', '.join(f'(SELECT substr(md5(random()::text), 1, {colsize}))' for i in range(ncolumns))\n insert_data=f'insert into {table_name} select generate_series(1,{nrows}),{insert_definitions}, now();'\n db.execute_sqls(insert_data) \n\n #select without the index\n missing_index='select * from '+table_name+' where id='\n db.concurrent_execute_sql(threads,duration,missing_index,nrows)\n\n #delete the table\n delete_table(table_name)\n #print the end time\n print_time()\n\nif __name__ == \"__main__\":\n # Number of threads to use for concurrent inserts\n num_threads = 10\n \n # Duration for which to run the inserts (in seconds)\n insert_duration = None\n \n # Number of columns in the table\n num_columns = 5\n \n # Number of rows to insert\n num_rows = 4000000\n \n # Size of each column (in characters)\n column_size = 100\n \n # Table name\n table_name = 'table1'\n \n # Call the insert_large_data function\n missing_index(num_threads, insert_duration, num_columns, num_rows, column_size, table_name)\n", + "description": "In a financial database system, 10 users are simultaneously searching for data in a table with 5 columns, containing 4,000,000 rows. Each column has a size of 100 characters. However, the search operation lacks the necessary indexes, resulting in a delay or exception in retrieving the desired information.\n", + "desc": "In an e-commerce scenario, there is a database named 'ECommerceDB' which is used for storing information about various products. Within the database, there is a table called 'ProductDetails' which contains 4,000,000 rows of data, each representing a product. This table consists of 5 columns, each column containing information of up to 100 characters. These columns include product ID, name, price, stock quantity, and category. However, due to the lack of necessary indexes on the 'ProductDetails' table, especially on commonly used columns such as name and category, the database struggles to efficiently handle search queries. In particular, when 10 users simultaneously search for products through the platform's search interface, the database becomes inefficient in processing these concurrent search requests. 
This inefficiency leads to anomalies in the database, resulting in decreased user experience and potentially affecting the operational efficiency and reputation of the entire e-commerce platform.\n" + }, + "472": { + "start_time": "1697442676", + "end_time": "1697442748", + "start_timestamp": "2023-10-16 15:51:16", + "end_timestamp": "2023-10-16 15:52:28", + "alerts": [], + "labels": [ + "missing indexes" + ], + "command": "python anomaly_trigger/main.py --anomaly MISSING_INDEXES", + "script": "import psycopg2\nimport sys\nsys.path.append('/root/DB-GPT/')\nimport time\nimport datetime\nimport random\nimport yaml\nfrom multiprocessing.pool import *\n\n\nclass DBArgs(object):\n\n def __init__(self, dbtype, config, dbname=None):\n self.dbtype = dbtype\n if self.dbtype == 'mysql':\n self.host = config['host']\n self.port = config['port']\n self.user = config['user']\n self.password = config['password']\n self.dbname = dbname if dbname else config['dbname']\n self.driver = 'com.mysql.jdbc.Driver'\n self.jdbc = 'jdbc:mysql://'\n else:\n self.host = config['host']\n self.port = config['port']\n self.user = config['user']\n self.password = config['password']\n self.dbname = dbname if dbname else config['dbname']\n self.driver = 'org.postgresql.Driver'\n self.jdbc = 'jdbc:postgresql://'\n\nclass Database():\n def __init__(self, args, timeout=-1):\n self.args = args\n self.conn = self.resetConn(timeout)\n\n\n # self.schema = self.compute_table_schema()\n\n def resetConn(self, timeout=-1):\n conn = psycopg2.connect(database=self.args.dbname,\n user=self.args.user,\n password=self.args.password,\n host=self.args.host,\n port=self.args.port)\n return conn\n \n def execute_sqls(self,sql):\n self.conn =self.resetConn(timeout=-1)\n cur = self.conn.cursor()\n cur.execute(sql)\n self.conn.commit()\n cur.close()\n self.conn.close()\n\n def execute_sql_duration(self, duration, sql, max_id=0, commit_interval=500):\n self.conn = self.resetConn(timeout=-1)\n cursor = self.conn.cursor()\n start = time.time()\n cnt = 0\n if duration > 0:\n while (time.time() - start) < duration:\n if max_id > 0:\n id = random.randint(1, max_id - 1)\n cursor.execute(sql + str(id) + ';')\n else:\n cursor.execute(sql)\n cnt += 1\n if cnt % commit_interval == 0:\n self.conn.commit()\n else:\n print(\"error, the duration should be larger than 0\")\n self.conn.commit()\n cursor.close()\n self.conn.close()\n return cnt\n\n def concurrent_execute_sql(self, threads, duration, sql, max_id=0, commit_interval=500):\n pool = ThreadPool(threads)\n results = [pool.apply_async(self.execute_sql_duration, (duration, sql, max_id, commit_interval)) for _ in range(threads)]\n pool.close()\n pool.join()\n return results\n\ndef init():\n #add the config\n config_path = \"/root/DB-GPT/config/tool_config.yaml\"\n with open(config_path, 'r') as config_file:\n config = yaml.safe_load(config_file) \n db_args =DBArgs('pgsql', config)\n return db_args\n\n\n#create a table\ndef create_table(table_name,colsize, ncolumns):\n db=Database(init())\n column_definitions = ', '.join(f'name{i} varchar({colsize})' for i in range(ncolumns))\n creat_sql = f'CREATE TABLE {table_name} (id int, {column_definitions}, time timestamp);'\n db.execute_sqls(creat_sql)\n\n#delete the table\ndef delete_table(table_name):\n db=Database(init())\n delete_sql=f'DROP TABLE if exists {table_name}'\n db.execute_sqls(delete_sql)\n\n#print the current time\ndef print_time():\n current_time = datetime.datetime.now()\n formatted_time = current_time.strftime(\"%Y-%m-%d %H:%M:%S\")\n 
print(formatted_time)\n\n\ndef missing_index(threads,duration,ncolumns,nrows,colsize,table_name='table1'):\n #create a new table\n print_time()\n db=Database(init())\n delete_table(table_name)\n create_table(table_name,colsize, ncolumns)\n\n # insert some data to be selected \n insert_definitions = ', '.join(f'(SELECT substr(md5(random()::text), 1, {colsize}))' for i in range(ncolumns))\n insert_data=f'insert into {table_name} select generate_series(1,{nrows}),{insert_definitions}, now();'\n db.execute_sqls(insert_data) \n\n #select without the index\n missing_index='select * from '+table_name+' where id='\n db.concurrent_execute_sql(threads,duration,missing_index,nrows)\n\n #delete the table\n delete_table(table_name)\n #print the end time\n print_time()\n\nif __name__ == \"__main__\":\n # Number of threads to use for concurrent inserts\n num_threads = 10\n \n # Duration for which to run the inserts (in seconds)\n insert_duration = None\n \n # Number of columns in the table\n num_columns = 5\n \n # Number of rows to insert\n num_rows = 4000000\n \n # Size of each column (in characters)\n column_size = 100\n \n # Table name\n table_name = 'table1'\n \n # Call the insert_large_data function\n missing_index(num_threads, insert_duration, num_columns, num_rows, column_size, table_name)\n", + "description": "In a database of a social media platform, if there are 10 users searching in a database table containing 5 columns, 4,000,000 rows, each column size of 100 characters, but the search lacks the necessary index, resulting in an exception in the database.\n", + "desc": "In the banking industry, there is a database named 'BankingDB' that stores customer and transaction data for a bank. This database contains a key table called 'TransactionRecords' which records detailed information about various banking transactions. The table consists of 4,000,000 rows of data, each representing a transaction record, with 5 columns. These columns may include transaction ID, customer ID, transaction type (such as deposit, withdrawal, or transfer), transaction amount, and transaction date and time. However, this database lacks necessary indexes on commonly used columns such as customer ID and transaction type. As a result, when multiple users simultaneously query the database, especially for specific customer transactions or transaction types, the database becomes inefficient in handling these concurrent search requests. 
This inefficiency can lead to anomalies, such as delays or failures in retrieving transaction records, affecting both user experience and the bank's operational efficiency.\n" + }, + "473": { + "start_time": "1697442822", + "end_time": "1697442893", + "start_timestamp": "2023-10-16 15:53:42", + "end_timestamp": "2023-10-16 15:54:53", + "alerts": [], + "labels": [ + "missing indexes" + ], + "command": "python anomaly_trigger/main.py --anomaly MISSING_INDEXES", + "script": "import psycopg2\nimport sys\nsys.path.append('/root/DB-GPT/')\nimport time\nimport datetime\nimport random\nimport yaml\nfrom multiprocessing.pool import *\n\n\nclass DBArgs(object):\n\n def __init__(self, dbtype, config, dbname=None):\n self.dbtype = dbtype\n if self.dbtype == 'mysql':\n self.host = config['host']\n self.port = config['port']\n self.user = config['user']\n self.password = config['password']\n self.dbname = dbname if dbname else config['dbname']\n self.driver = 'com.mysql.jdbc.Driver'\n self.jdbc = 'jdbc:mysql://'\n else:\n self.host = config['host']\n self.port = config['port']\n self.user = config['user']\n self.password = config['password']\n self.dbname = dbname if dbname else config['dbname']\n self.driver = 'org.postgresql.Driver'\n self.jdbc = 'jdbc:postgresql://'\n\nclass Database():\n def __init__(self, args, timeout=-1):\n self.args = args\n self.conn = self.resetConn(timeout)\n\n\n # self.schema = self.compute_table_schema()\n\n def resetConn(self, timeout=-1):\n conn = psycopg2.connect(database=self.args.dbname,\n user=self.args.user,\n password=self.args.password,\n host=self.args.host,\n port=self.args.port)\n return conn\n \n def execute_sqls(self,sql):\n self.conn =self.resetConn(timeout=-1)\n cur = self.conn.cursor()\n cur.execute(sql)\n self.conn.commit()\n cur.close()\n self.conn.close()\n\n def execute_sql_duration(self, duration, sql, max_id=0, commit_interval=500):\n self.conn = self.resetConn(timeout=-1)\n cursor = self.conn.cursor()\n start = time.time()\n cnt = 0\n if duration > 0:\n while (time.time() - start) < duration:\n if max_id > 0:\n id = random.randint(1, max_id - 1)\n cursor.execute(sql + str(id) + ';')\n else:\n cursor.execute(sql)\n cnt += 1\n if cnt % commit_interval == 0:\n self.conn.commit()\n else:\n print(\"error, the duration should be larger than 0\")\n self.conn.commit()\n cursor.close()\n self.conn.close()\n return cnt\n\n def concurrent_execute_sql(self, threads, duration, sql, max_id=0, commit_interval=500):\n pool = ThreadPool(threads)\n results = [pool.apply_async(self.execute_sql_duration, (duration, sql, max_id, commit_interval)) for _ in range(threads)]\n pool.close()\n pool.join()\n return results\n\ndef init():\n #add the config\n config_path = \"/root/DB-GPT/config/tool_config.yaml\"\n with open(config_path, 'r') as config_file:\n config = yaml.safe_load(config_file) \n db_args =DBArgs('pgsql', config)\n return db_args\n\n\n#create a table\ndef create_table(table_name,colsize, ncolumns):\n db=Database(init())\n column_definitions = ', '.join(f'name{i} varchar({colsize})' for i in range(ncolumns))\n creat_sql = f'CREATE TABLE {table_name} (id int, {column_definitions}, time timestamp);'\n db.execute_sqls(creat_sql)\n\n#delete the table\ndef delete_table(table_name):\n db=Database(init())\n delete_sql=f'DROP TABLE if exists {table_name}'\n db.execute_sqls(delete_sql)\n\n#print the current time\ndef print_time():\n current_time = datetime.datetime.now()\n formatted_time = current_time.strftime(\"%Y-%m-%d %H:%M:%S\")\n print(formatted_time)\n\n\ndef 
missing_index(threads,duration,ncolumns,nrows,colsize,table_name='table1'):\n #create a new table\n print_time()\n db=Database(init())\n delete_table(table_name)\n create_table(table_name,colsize, ncolumns)\n\n # insert some data to be selected \n insert_definitions = ', '.join(f'(SELECT substr(md5(random()::text), 1, {colsize}))' for i in range(ncolumns))\n insert_data=f'insert into {table_name} select generate_series(1,{nrows}),{insert_definitions}, now();'\n db.execute_sqls(insert_data) \n\n #select without the index\n missing_index='select * from '+table_name+' where id='\n db.concurrent_execute_sql(threads,duration,missing_index,nrows)\n\n #delete the table\n delete_table(table_name)\n #print the end time\n print_time()\n\nif __name__ == \"__main__\":\n # Number of threads to use for concurrent inserts\n num_threads = 5\n \n # Duration for which to run the inserts (in seconds)\n insert_duration = None\n \n # Number of columns in the table\n num_columns = 10\n \n # Number of rows to insert\n num_rows = 2000000\n \n # Size of each column (in characters)\n column_size = 50\n \n # Table name\n table_name = 'table1'\n \n # Call the insert_large_data function\n missing_index(num_threads, insert_duration, num_columns, num_rows, column_size, table_name)\n", + "description": "In an online gaming platform's database, if there are 5 players searching for game statistics in a table with 10 columns and 2 million rows, each column containing data of size 50 characters, but the search operation lacks the necessary index, it may result in an exception in the database.\n", + "desc": "In the e-commerce scenario of an online store, there is a database named 'StoreDB' that stores information about various products. Within this database, there is a table called 'ProductCatalog' that contains 2,000,000 rows of data, representing different products. This table has 10 columns including product ID, name, price, description, brand, category, size, color, weight, and availability. Each column can hold up to 50 characters of data. However, this table lacks necessary indexes, particularly on commonly used search columns such as name, brand, and category. As a result, when users search for products through the store's search interface, the database becomes inefficient in handling a large number of concurrent search requests. 
This inefficiency can lead to anomalies in the database, such as delays or failures in search functionality, impacting the user experience and the overall operational efficiency of the online store.\n" + }, + "474": { + "start_time": "1697442953", + "end_time": "1697443024", + "start_timestamp": "2023-10-16 15:55:53", + "end_timestamp": "2023-10-16 15:57:04", + "alerts": [], + "labels": [ + "missing indexes" + ], + "command": "python anomaly_trigger/main.py --anomaly MISSING_INDEXES", + "script": "import psycopg2\nimport sys\nsys.path.append('/root/DB-GPT/')\nimport time\nimport datetime\nimport random\nimport yaml\nfrom multiprocessing.pool import *\n\n\nclass DBArgs(object):\n\n def __init__(self, dbtype, config, dbname=None):\n self.dbtype = dbtype\n if self.dbtype == 'mysql':\n self.host = config['host']\n self.port = config['port']\n self.user = config['user']\n self.password = config['password']\n self.dbname = dbname if dbname else config['dbname']\n self.driver = 'com.mysql.jdbc.Driver'\n self.jdbc = 'jdbc:mysql://'\n else:\n self.host = config['host']\n self.port = config['port']\n self.user = config['user']\n self.password = config['password']\n self.dbname = dbname if dbname else config['dbname']\n self.driver = 'org.postgresql.Driver'\n self.jdbc = 'jdbc:postgresql://'\n\nclass Database():\n def __init__(self, args, timeout=-1):\n self.args = args\n self.conn = self.resetConn(timeout)\n\n\n # self.schema = self.compute_table_schema()\n\n def resetConn(self, timeout=-1):\n conn = psycopg2.connect(database=self.args.dbname,\n user=self.args.user,\n password=self.args.password,\n host=self.args.host,\n port=self.args.port)\n return conn\n \n def execute_sqls(self,sql):\n self.conn =self.resetConn(timeout=-1)\n cur = self.conn.cursor()\n cur.execute(sql)\n self.conn.commit()\n cur.close()\n self.conn.close()\n\n def execute_sql_duration(self, duration, sql, max_id=0, commit_interval=500):\n self.conn = self.resetConn(timeout=-1)\n cursor = self.conn.cursor()\n start = time.time()\n cnt = 0\n if duration > 0:\n while (time.time() - start) < duration:\n if max_id > 0:\n id = random.randint(1, max_id - 1)\n cursor.execute(sql + str(id) + ';')\n else:\n cursor.execute(sql)\n cnt += 1\n if cnt % commit_interval == 0:\n self.conn.commit()\n else:\n print(\"error, the duration should be larger than 0\")\n self.conn.commit()\n cursor.close()\n self.conn.close()\n return cnt\n\n def concurrent_execute_sql(self, threads, duration, sql, max_id=0, commit_interval=500):\n pool = ThreadPool(threads)\n results = [pool.apply_async(self.execute_sql_duration, (duration, sql, max_id, commit_interval)) for _ in range(threads)]\n pool.close()\n pool.join()\n return results\n\ndef init():\n #add the config\n config_path = \"/root/DB-GPT/config/tool_config.yaml\"\n with open(config_path, 'r') as config_file:\n config = yaml.safe_load(config_file) \n db_args =DBArgs('pgsql', config)\n return db_args\n\n\n#create a table\ndef create_table(table_name,colsize, ncolumns):\n db=Database(init())\n column_definitions = ', '.join(f'name{i} varchar({colsize})' for i in range(ncolumns))\n creat_sql = f'CREATE TABLE {table_name} (id int, {column_definitions}, time timestamp);'\n db.execute_sqls(creat_sql)\n\n#delete the table\ndef delete_table(table_name):\n db=Database(init())\n delete_sql=f'DROP TABLE if exists {table_name}'\n db.execute_sqls(delete_sql)\n\n#print the current time\ndef print_time():\n current_time = datetime.datetime.now()\n formatted_time = current_time.strftime(\"%Y-%m-%d %H:%M:%S\")\n 
print(formatted_time)\n\n\ndef missing_index(threads,duration,ncolumns,nrows,colsize,table_name='table1'):\n #create a new table\n print_time()\n db=Database(init())\n delete_table(table_name)\n create_table(table_name,colsize, ncolumns)\n\n # insert some data to be selected \n insert_definitions = ', '.join(f'(SELECT substr(md5(random()::text), 1, {colsize}))' for i in range(ncolumns))\n insert_data=f'insert into {table_name} select generate_series(1,{nrows}),{insert_definitions}, now();'\n db.execute_sqls(insert_data) \n\n #select without the index\n missing_index='select * from '+table_name+' where id='\n db.concurrent_execute_sql(threads,duration,missing_index,nrows)\n\n #delete the table\n delete_table(table_name)\n #print the end time\n print_time()\n\nif __name__ == \"__main__\":\n # Number of threads to use for concurrent inserts\n num_threads = 5\n \n # Duration for which to run the inserts (in seconds)\n insert_duration = None\n \n # Number of columns in the table\n num_columns = 10\n \n # Number of rows to insert\n num_rows = 2000000\n \n # Size of each column (in characters)\n column_size = 50\n \n # Table name\n table_name = 'table1'\n \n # Call the insert_large_data function\n missing_index(num_threads, insert_duration, num_columns, num_rows, column_size, table_name)\n", + "description": "In a database used by an online platform, there is a scenario where 5 users are searching in a table with 10 columns and 2,000,000 rows. Each column size is 50 characters. However, the search is missing the necessary indexes, which can lead to a database exception. This can occur in situations where the platform's search functionality is not optimized with the appropriate indexes.\n", + "desc": "In an e-commerce scenario, there is a database used by an online store called 'OnlineStoreDB'. This database contains a table called 'ProductDetails', which stores information about various products. Within this table, there are 2 million rows of data, each representing a product, and it includes 10 columns with a size of 50 characters each. These columns may include product ID, name, price, stock quantity, brand, category, size, color, weight, and user rating. The online store experiences a high volume of concurrent search requests from users. However, due to the lack of necessary indexes on commonly used search columns, such as name, brand, and category, the database becomes inefficient in handling these search requests. 
This inefficiency can lead to anomalies in the database, such as search delays or failures, which can negatively impact the user experience and the overall efficiency of the online store.\n" + }, + "475": { + "start_time": "1697443084", + "end_time": "1697443155", + "start_timestamp": "2023-10-16 15:58:04", + "end_timestamp": "2023-10-16 15:59:15", + "alerts": [], + "labels": [ + "missing indexes" + ], + "command": "python anomaly_trigger/main.py --anomaly MISSING_INDEXES", + "script": "import psycopg2\nimport sys\nsys.path.append('/root/DB-GPT/')\nimport time\nimport datetime\nimport random\nimport yaml\nfrom multiprocessing.pool import *\n\n\nclass DBArgs(object):\n\n def __init__(self, dbtype, config, dbname=None):\n self.dbtype = dbtype\n if self.dbtype == 'mysql':\n self.host = config['host']\n self.port = config['port']\n self.user = config['user']\n self.password = config['password']\n self.dbname = dbname if dbname else config['dbname']\n self.driver = 'com.mysql.jdbc.Driver'\n self.jdbc = 'jdbc:mysql://'\n else:\n self.host = config['host']\n self.port = config['port']\n self.user = config['user']\n self.password = config['password']\n self.dbname = dbname if dbname else config['dbname']\n self.driver = 'org.postgresql.Driver'\n self.jdbc = 'jdbc:postgresql://'\n\nclass Database():\n def __init__(self, args, timeout=-1):\n self.args = args\n self.conn = self.resetConn(timeout)\n\n\n # self.schema = self.compute_table_schema()\n\n def resetConn(self, timeout=-1):\n conn = psycopg2.connect(database=self.args.dbname,\n user=self.args.user,\n password=self.args.password,\n host=self.args.host,\n port=self.args.port)\n return conn\n \n def execute_sqls(self,sql):\n self.conn =self.resetConn(timeout=-1)\n cur = self.conn.cursor()\n cur.execute(sql)\n self.conn.commit()\n cur.close()\n self.conn.close()\n\n def execute_sql_duration(self, duration, sql, max_id=0, commit_interval=500):\n self.conn = self.resetConn(timeout=-1)\n cursor = self.conn.cursor()\n start = time.time()\n cnt = 0\n if duration > 0:\n while (time.time() - start) < duration:\n if max_id > 0:\n id = random.randint(1, max_id - 1)\n cursor.execute(sql + str(id) + ';')\n else:\n cursor.execute(sql)\n cnt += 1\n if cnt % commit_interval == 0:\n self.conn.commit()\n else:\n print(\"error, the duration should be larger than 0\")\n self.conn.commit()\n cursor.close()\n self.conn.close()\n return cnt\n\n def concurrent_execute_sql(self, threads, duration, sql, max_id=0, commit_interval=500):\n pool = ThreadPool(threads)\n results = [pool.apply_async(self.execute_sql_duration, (duration, sql, max_id, commit_interval)) for _ in range(threads)]\n pool.close()\n pool.join()\n return results\n\ndef init():\n #add the config\n config_path = \"/root/DB-GPT/config/tool_config.yaml\"\n with open(config_path, 'r') as config_file:\n config = yaml.safe_load(config_file) \n db_args =DBArgs('pgsql', config)\n return db_args\n\n\n#create a table\ndef create_table(table_name,colsize, ncolumns):\n db=Database(init())\n column_definitions = ', '.join(f'name{i} varchar({colsize})' for i in range(ncolumns))\n creat_sql = f'CREATE TABLE {table_name} (id int, {column_definitions}, time timestamp);'\n db.execute_sqls(creat_sql)\n\n#delete the table\ndef delete_table(table_name):\n db=Database(init())\n delete_sql=f'DROP TABLE if exists {table_name}'\n db.execute_sqls(delete_sql)\n\n#print the current time\ndef print_time():\n current_time = datetime.datetime.now()\n formatted_time = current_time.strftime(\"%Y-%m-%d %H:%M:%S\")\n 
print(formatted_time)\n\n\ndef missing_index(threads,duration,ncolumns,nrows,colsize,table_name='table1'):\n #create a new table\n print_time()\n db=Database(init())\n delete_table(table_name)\n create_table(table_name,colsize, ncolumns)\n\n # insert some data to be selected \n insert_definitions = ', '.join(f'(SELECT substr(md5(random()::text), 1, {colsize}))' for i in range(ncolumns))\n insert_data=f'insert into {table_name} select generate_series(1,{nrows}),{insert_definitions}, now();'\n db.execute_sqls(insert_data) \n\n #select without the index\n missing_index='select * from '+table_name+' where id='\n db.concurrent_execute_sql(threads,duration,missing_index,nrows)\n\n #delete the table\n delete_table(table_name)\n #print the end time\n print_time()\n\nif __name__ == \"__main__\":\n # Number of threads to use for concurrent inserts\n num_threads = 10\n \n # Duration for which to run the inserts (in seconds)\n insert_duration = None\n \n # Number of columns in the table\n num_columns = 10\n \n # Number of rows to insert\n num_rows = 2000000\n \n # Size of each column (in characters)\n column_size = 50\n \n # Table name\n table_name = 'table1'\n \n # Call the insert_large_data function\n missing_index(num_threads, insert_duration, num_columns, num_rows, column_size, table_name)\n", + "description": "In a database used for a sales management system, there are 10 concurrent users searching in a table with 10 columns. This table contains 2,000,000 rows, with each column having a size of 50 characters. The search operation lacks the necessary index, resulting in a database exception.\n", + "desc": "In the e-commerce database of an online store, there is a table called 'ProductDetails' that stores information about various products. This table contains 2,000,000 rows of data, with 10 columns. The columns include product ID, name, price, stock quantity, brand, category, size, color, weight, and description, with each column having a size of 50 characters. However, the table lacks necessary indexes, particularly on commonly used columns such as name, brand, and category. As a result, when multiple users search for products simultaneously, the database struggles to efficiently handle the requests, leading to delays or failures in the search process. 
This negatively impacts user experience and may affect the overall efficiency and reputation of the e-commerce platform.\n" + }, + "476": { + "start_time": "1697443215", + "end_time": "1697443287", + "start_timestamp": "2023-10-16 16:00:15", + "end_timestamp": "2023-10-16 16:01:27", + "alerts": [], + "labels": [ + "missing indexes" + ], + "command": "python anomaly_trigger/main.py --anomaly MISSING_INDEXES", + "script": "import psycopg2\nimport sys\nsys.path.append('/root/DB-GPT/')\nimport time\nimport datetime\nimport random\nimport yaml\nfrom multiprocessing.pool import *\n\n\nclass DBArgs(object):\n\n def __init__(self, dbtype, config, dbname=None):\n self.dbtype = dbtype\n if self.dbtype == 'mysql':\n self.host = config['host']\n self.port = config['port']\n self.user = config['user']\n self.password = config['password']\n self.dbname = dbname if dbname else config['dbname']\n self.driver = 'com.mysql.jdbc.Driver'\n self.jdbc = 'jdbc:mysql://'\n else:\n self.host = config['host']\n self.port = config['port']\n self.user = config['user']\n self.password = config['password']\n self.dbname = dbname if dbname else config['dbname']\n self.driver = 'org.postgresql.Driver'\n self.jdbc = 'jdbc:postgresql://'\n\nclass Database():\n def __init__(self, args, timeout=-1):\n self.args = args\n self.conn = self.resetConn(timeout)\n\n\n # self.schema = self.compute_table_schema()\n\n def resetConn(self, timeout=-1):\n conn = psycopg2.connect(database=self.args.dbname,\n user=self.args.user,\n password=self.args.password,\n host=self.args.host,\n port=self.args.port)\n return conn\n \n def execute_sqls(self,sql):\n self.conn =self.resetConn(timeout=-1)\n cur = self.conn.cursor()\n cur.execute(sql)\n self.conn.commit()\n cur.close()\n self.conn.close()\n\n def execute_sql_duration(self, duration, sql, max_id=0, commit_interval=500):\n self.conn = self.resetConn(timeout=-1)\n cursor = self.conn.cursor()\n start = time.time()\n cnt = 0\n if duration > 0:\n while (time.time() - start) < duration:\n if max_id > 0:\n id = random.randint(1, max_id - 1)\n cursor.execute(sql + str(id) + ';')\n else:\n cursor.execute(sql)\n cnt += 1\n if cnt % commit_interval == 0:\n self.conn.commit()\n else:\n print(\"error, the duration should be larger than 0\")\n self.conn.commit()\n cursor.close()\n self.conn.close()\n return cnt\n\n def concurrent_execute_sql(self, threads, duration, sql, max_id=0, commit_interval=500):\n pool = ThreadPool(threads)\n results = [pool.apply_async(self.execute_sql_duration, (duration, sql, max_id, commit_interval)) for _ in range(threads)]\n pool.close()\n pool.join()\n return results\n\ndef init():\n #add the config\n config_path = \"/root/DB-GPT/config/tool_config.yaml\"\n with open(config_path, 'r') as config_file:\n config = yaml.safe_load(config_file) \n db_args =DBArgs('pgsql', config)\n return db_args\n\n\n#create a table\ndef create_table(table_name,colsize, ncolumns):\n db=Database(init())\n column_definitions = ', '.join(f'name{i} varchar({colsize})' for i in range(ncolumns))\n creat_sql = f'CREATE TABLE {table_name} (id int, {column_definitions}, time timestamp);'\n db.execute_sqls(creat_sql)\n\n#delete the table\ndef delete_table(table_name):\n db=Database(init())\n delete_sql=f'DROP TABLE if exists {table_name}'\n db.execute_sqls(delete_sql)\n\n#print the current time\ndef print_time():\n current_time = datetime.datetime.now()\n formatted_time = current_time.strftime(\"%Y-%m-%d %H:%M:%S\")\n print(formatted_time)\n\n\ndef 
missing_index(threads,duration,ncolumns,nrows,colsize,table_name='table1'):\n #create a new table\n print_time()\n db=Database(init())\n delete_table(table_name)\n create_table(table_name,colsize, ncolumns)\n\n # insert some data to be selected \n insert_definitions = ', '.join(f'(SELECT substr(md5(random()::text), 1, {colsize}))' for i in range(ncolumns))\n insert_data=f'insert into {table_name} select generate_series(1,{nrows}),{insert_definitions}, now();'\n db.execute_sqls(insert_data) \n\n #select without the index\n missing_index='select * from '+table_name+' where id='\n db.concurrent_execute_sql(threads,duration,missing_index,nrows)\n\n #delete the table\n delete_table(table_name)\n #print the end time\n print_time()\n\nif __name__ == \"__main__\":\n # Number of threads to use for concurrent inserts\n num_threads = 10\n \n # Duration for which to run the inserts (in seconds)\n insert_duration = None\n \n # Number of columns in the table\n num_columns = 10\n \n # Number of rows to insert\n num_rows = 2000000\n \n # Size of each column (in characters)\n column_size = 50\n \n # Table name\n table_name = 'table1'\n \n # Call the insert_large_data function\n missing_index(num_threads, insert_duration, num_columns, num_rows, column_size, table_name)\n", + "description": "In the database of an online platform, there are threads of 10 users searching for data in a table with 10 columns. The table contains 2,000,000 rows, with each column having a size of 50 characters. However, the search operation lacks the necessary indexes, leading to a database exception.\n", + "desc": "In the database of an e-commerce platform, there is a database specifically used for storing information about various types of products. This database contains a table named 'ProductDetails', which records detailed information about each product. Suppose this table contains 2,000,000 rows of data, each row representing a product entry, with a total of 10 columns, each containing information of up to 50 characters. These columns may include product ID, name, price, stock quantity, brand, category, size, color, weight, and user rating. When users search for products on the platform, they might input specific search terms such as 'smartphone' or 't-shirt'. However, due to the lack of necessary indexes on commonly used search columns like name or category, the database might struggle to efficiently process these search requests. As a result, users might experience slow or failed searches, negatively impacting their experience on the platform. 
Additionally, the lack of indexes can affect the overall operational efficiency and reputation of the e-commerce platform.\n" + }, + "477": { + "start_time": "1697443374", + "end_time": "1697443446", + "start_timestamp": "2023-10-16 16:02:54", + "end_timestamp": "2023-10-16 16:04:06", + "alerts": [], + "labels": [ + "missing indexes" + ], + "command": "python anomaly_trigger/main.py --anomaly MISSING_INDEXES", + "script": "import psycopg2\nimport sys\nsys.path.append('/root/DB-GPT/')\nimport time\nimport datetime\nimport random\nimport yaml\nfrom multiprocessing.pool import *\n\n\nclass DBArgs(object):\n\n def __init__(self, dbtype, config, dbname=None):\n self.dbtype = dbtype\n if self.dbtype == 'mysql':\n self.host = config['host']\n self.port = config['port']\n self.user = config['user']\n self.password = config['password']\n self.dbname = dbname if dbname else config['dbname']\n self.driver = 'com.mysql.jdbc.Driver'\n self.jdbc = 'jdbc:mysql://'\n else:\n self.host = config['host']\n self.port = config['port']\n self.user = config['user']\n self.password = config['password']\n self.dbname = dbname if dbname else config['dbname']\n self.driver = 'org.postgresql.Driver'\n self.jdbc = 'jdbc:postgresql://'\n\nclass Database():\n def __init__(self, args, timeout=-1):\n self.args = args\n self.conn = self.resetConn(timeout)\n\n\n # self.schema = self.compute_table_schema()\n\n def resetConn(self, timeout=-1):\n conn = psycopg2.connect(database=self.args.dbname,\n user=self.args.user,\n password=self.args.password,\n host=self.args.host,\n port=self.args.port)\n return conn\n \n def execute_sqls(self,sql):\n self.conn =self.resetConn(timeout=-1)\n cur = self.conn.cursor()\n cur.execute(sql)\n self.conn.commit()\n cur.close()\n self.conn.close()\n\n def execute_sql_duration(self, duration, sql, max_id=0, commit_interval=500):\n self.conn = self.resetConn(timeout=-1)\n cursor = self.conn.cursor()\n start = time.time()\n cnt = 0\n if duration > 0:\n while (time.time() - start) < duration:\n if max_id > 0:\n id = random.randint(1, max_id - 1)\n cursor.execute(sql + str(id) + ';')\n else:\n cursor.execute(sql)\n cnt += 1\n if cnt % commit_interval == 0:\n self.conn.commit()\n else:\n print(\"error, the duration should be larger than 0\")\n self.conn.commit()\n cursor.close()\n self.conn.close()\n return cnt\n\n def concurrent_execute_sql(self, threads, duration, sql, max_id=0, commit_interval=500):\n pool = ThreadPool(threads)\n results = [pool.apply_async(self.execute_sql_duration, (duration, sql, max_id, commit_interval)) for _ in range(threads)]\n pool.close()\n pool.join()\n return results\n\ndef init():\n #add the config\n config_path = \"/root/DB-GPT/config/tool_config.yaml\"\n with open(config_path, 'r') as config_file:\n config = yaml.safe_load(config_file) \n db_args =DBArgs('pgsql', config)\n return db_args\n\n\n#create a table\ndef create_table(table_name,colsize, ncolumns):\n db=Database(init())\n column_definitions = ', '.join(f'name{i} varchar({colsize})' for i in range(ncolumns))\n creat_sql = f'CREATE TABLE {table_name} (id int, {column_definitions}, time timestamp);'\n db.execute_sqls(creat_sql)\n\n#delete the table\ndef delete_table(table_name):\n db=Database(init())\n delete_sql=f'DROP TABLE if exists {table_name}'\n db.execute_sqls(delete_sql)\n\n#print the current time\ndef print_time():\n current_time = datetime.datetime.now()\n formatted_time = current_time.strftime(\"%Y-%m-%d %H:%M:%S\")\n print(formatted_time)\n\n\ndef 
missing_index(threads,duration,ncolumns,nrows,colsize,table_name='table1'):\n #create a new table\n print_time()\n db=Database(init())\n delete_table(table_name)\n create_table(table_name,colsize, ncolumns)\n\n # insert some data to be selected \n insert_definitions = ', '.join(f'(SELECT substr(md5(random()::text), 1, {colsize}))' for i in range(ncolumns))\n insert_data=f'insert into {table_name} select generate_series(1,{nrows}),{insert_definitions}, now();'\n db.execute_sqls(insert_data) \n\n #select without the index\n missing_index='select * from '+table_name+' where id='\n db.concurrent_execute_sql(threads,duration,missing_index,nrows)\n\n #delete the table\n delete_table(table_name)\n #print the end time\n print_time()\n\nif __name__ == \"__main__\":\n # Number of threads to use for concurrent inserts\n num_threads = 5\n \n # Duration for which to run the inserts (in seconds)\n insert_duration = None\n \n # Number of columns in the table\n num_columns = 10\n \n # Number of rows to insert\n num_rows = 4000000\n \n # Size of each column (in characters)\n column_size = 50\n \n # Table name\n table_name = 'table1'\n \n # Call the insert_large_data function\n missing_index(num_threads, insert_duration, num_columns, num_rows, column_size, table_name)\n", + "description": "In a financial database with 10 columns and 4,000,000 rows, each with a column size of 50 characters, there is a search operation being performed by 5 users simultaneously. However, there is no index on the columns being searched, resulting in a database exception.\n", + "desc": "In an e-commerce database, there is a table called 'ProductInfo' which stores information about various products. This table contains 4,000,000 rows of data, each representing a product, with a total of 10 columns. These columns include product ID, name, price, stock quantity, brand, category, size, color, weight, and user rating. The size of each column is limited to 50 characters. However, there are missing indexes on the commonly used search columns such as name, brand, and category. This lack of indexes causes inefficiency in handling search queries, especially when there are simultaneous search requests. 
As a result, there can be delays or failures in searching for products, leading to a decline in user experience and potentially affecting the overall performance and reputation of the e-commerce platform.\n" + }, + "478": { + "start_time": "1697443506", + "end_time": "1697443577", + "start_timestamp": "2023-10-16 16:05:06", + "end_timestamp": "2023-10-16 16:06:17", + "alerts": [], + "labels": [ + "missing indexes" + ], + "command": "python anomaly_trigger/main.py --anomaly MISSING_INDEXES", + "script": "import psycopg2\nimport sys\nsys.path.append('/root/DB-GPT/')\nimport time\nimport datetime\nimport random\nimport yaml\nfrom multiprocessing.pool import *\n\n\nclass DBArgs(object):\n\n def __init__(self, dbtype, config, dbname=None):\n self.dbtype = dbtype\n if self.dbtype == 'mysql':\n self.host = config['host']\n self.port = config['port']\n self.user = config['user']\n self.password = config['password']\n self.dbname = dbname if dbname else config['dbname']\n self.driver = 'com.mysql.jdbc.Driver'\n self.jdbc = 'jdbc:mysql://'\n else:\n self.host = config['host']\n self.port = config['port']\n self.user = config['user']\n self.password = config['password']\n self.dbname = dbname if dbname else config['dbname']\n self.driver = 'org.postgresql.Driver'\n self.jdbc = 'jdbc:postgresql://'\n\nclass Database():\n def __init__(self, args, timeout=-1):\n self.args = args\n self.conn = self.resetConn(timeout)\n\n\n # self.schema = self.compute_table_schema()\n\n def resetConn(self, timeout=-1):\n conn = psycopg2.connect(database=self.args.dbname,\n user=self.args.user,\n password=self.args.password,\n host=self.args.host,\n port=self.args.port)\n return conn\n \n def execute_sqls(self,sql):\n self.conn =self.resetConn(timeout=-1)\n cur = self.conn.cursor()\n cur.execute(sql)\n self.conn.commit()\n cur.close()\n self.conn.close()\n\n def execute_sql_duration(self, duration, sql, max_id=0, commit_interval=500):\n self.conn = self.resetConn(timeout=-1)\n cursor = self.conn.cursor()\n start = time.time()\n cnt = 0\n if duration > 0:\n while (time.time() - start) < duration:\n if max_id > 0:\n id = random.randint(1, max_id - 1)\n cursor.execute(sql + str(id) + ';')\n else:\n cursor.execute(sql)\n cnt += 1\n if cnt % commit_interval == 0:\n self.conn.commit()\n else:\n print(\"error, the duration should be larger than 0\")\n self.conn.commit()\n cursor.close()\n self.conn.close()\n return cnt\n\n def concurrent_execute_sql(self, threads, duration, sql, max_id=0, commit_interval=500):\n pool = ThreadPool(threads)\n results = [pool.apply_async(self.execute_sql_duration, (duration, sql, max_id, commit_interval)) for _ in range(threads)]\n pool.close()\n pool.join()\n return results\n\ndef init():\n #add the config\n config_path = \"/root/DB-GPT/config/tool_config.yaml\"\n with open(config_path, 'r') as config_file:\n config = yaml.safe_load(config_file) \n db_args =DBArgs('pgsql', config)\n return db_args\n\n\n#create a table\ndef create_table(table_name,colsize, ncolumns):\n db=Database(init())\n column_definitions = ', '.join(f'name{i} varchar({colsize})' for i in range(ncolumns))\n creat_sql = f'CREATE TABLE {table_name} (id int, {column_definitions}, time timestamp);'\n db.execute_sqls(creat_sql)\n\n#delete the table\ndef delete_table(table_name):\n db=Database(init())\n delete_sql=f'DROP TABLE if exists {table_name}'\n db.execute_sqls(delete_sql)\n\n#print the current time\ndef print_time():\n current_time = datetime.datetime.now()\n formatted_time = current_time.strftime(\"%Y-%m-%d %H:%M:%S\")\n 
print(formatted_time)\n\n\ndef missing_index(threads,duration,ncolumns,nrows,colsize,table_name='table1'):\n #create a new table\n print_time()\n db=Database(init())\n delete_table(table_name)\n create_table(table_name,colsize, ncolumns)\n\n # insert some data to be selected \n insert_definitions = ', '.join(f'(SELECT substr(md5(random()::text), 1, {colsize}))' for i in range(ncolumns))\n insert_data=f'insert into {table_name} select generate_series(1,{nrows}),{insert_definitions}, now();'\n db.execute_sqls(insert_data) \n\n #select without the index\n missing_index='select * from '+table_name+' where id='\n db.concurrent_execute_sql(threads,duration,missing_index,nrows)\n\n #delete the table\n delete_table(table_name)\n #print the end time\n print_time()\n\nif __name__ == \"__main__\":\n # Number of threads to use for concurrent inserts\n num_threads = 5\n \n # Duration for which to run the inserts (in seconds)\n insert_duration = None\n \n # Number of columns in the table\n num_columns = 10\n \n # Number of rows to insert\n num_rows = 4000000\n \n # Size of each column (in characters)\n column_size = 50\n \n # Table name\n table_name = 'table1'\n \n # Call the insert_large_data function\n missing_index(num_threads, insert_duration, num_columns, num_rows, column_size, table_name)\n", + "description": "In the database of a social media platform, if there are 5 users searching in the database table containing 10 columns, 4,000,000 rows, each column size of 50 characters, but the search lacks the necessary index, resulting in an exception in the database.\n", + "desc": "In an e-commerce platform environment, where large amounts of data are being processed in real-time, there is a database called 'eCommerceDB' that handles product information and sales data. The primary table in this database is called 'ProductDetails', which contains 4 million rows of data, each representing a unique product. This table consists of 10 columns, including product ID, name, price, stock quantity, brand, category, description, image link, and product status.In this scenario, the database does not have the necessary indexes in place to efficiently handle a high number of concurrent search queries. This lack of indexes on commonly used search columns such as product name, brand, and category causes the database to become inefficient in processing search requests. As a result, the search functionality on the e-commerce platform experiences delays and failures, which negatively impacts the user experience. 
Additionally, the overall operational efficiency and reputation of the platform could be affected.\n" + }, + "479": { + "start_time": "1697443637", + "end_time": "1697443710", + "start_timestamp": "2023-10-16 16:07:17", + "end_timestamp": "2023-10-16 16:08:30", + "alerts": [], + "labels": [ + "missing indexes" + ], + "command": "python anomaly_trigger/main.py --anomaly MISSING_INDEXES", + "script": "import psycopg2\nimport sys\nsys.path.append('/root/DB-GPT/')\nimport time\nimport datetime\nimport random\nimport yaml\nfrom multiprocessing.pool import *\n\n\nclass DBArgs(object):\n\n def __init__(self, dbtype, config, dbname=None):\n self.dbtype = dbtype\n if self.dbtype == 'mysql':\n self.host = config['host']\n self.port = config['port']\n self.user = config['user']\n self.password = config['password']\n self.dbname = dbname if dbname else config['dbname']\n self.driver = 'com.mysql.jdbc.Driver'\n self.jdbc = 'jdbc:mysql://'\n else:\n self.host = config['host']\n self.port = config['port']\n self.user = config['user']\n self.password = config['password']\n self.dbname = dbname if dbname else config['dbname']\n self.driver = 'org.postgresql.Driver'\n self.jdbc = 'jdbc:postgresql://'\n\nclass Database():\n def __init__(self, args, timeout=-1):\n self.args = args\n self.conn = self.resetConn(timeout)\n\n\n # self.schema = self.compute_table_schema()\n\n def resetConn(self, timeout=-1):\n conn = psycopg2.connect(database=self.args.dbname,\n user=self.args.user,\n password=self.args.password,\n host=self.args.host,\n port=self.args.port)\n return conn\n \n def execute_sqls(self,sql):\n self.conn =self.resetConn(timeout=-1)\n cur = self.conn.cursor()\n cur.execute(sql)\n self.conn.commit()\n cur.close()\n self.conn.close()\n\n def execute_sql_duration(self, duration, sql, max_id=0, commit_interval=500):\n self.conn = self.resetConn(timeout=-1)\n cursor = self.conn.cursor()\n start = time.time()\n cnt = 0\n if duration > 0:\n while (time.time() - start) < duration:\n if max_id > 0:\n id = random.randint(1, max_id - 1)\n cursor.execute(sql + str(id) + ';')\n else:\n cursor.execute(sql)\n cnt += 1\n if cnt % commit_interval == 0:\n self.conn.commit()\n else:\n print(\"error, the duration should be larger than 0\")\n self.conn.commit()\n cursor.close()\n self.conn.close()\n return cnt\n\n def concurrent_execute_sql(self, threads, duration, sql, max_id=0, commit_interval=500):\n pool = ThreadPool(threads)\n results = [pool.apply_async(self.execute_sql_duration, (duration, sql, max_id, commit_interval)) for _ in range(threads)]\n pool.close()\n pool.join()\n return results\n\ndef init():\n #add the config\n config_path = \"/root/DB-GPT/config/tool_config.yaml\"\n with open(config_path, 'r') as config_file:\n config = yaml.safe_load(config_file) \n db_args =DBArgs('pgsql', config)\n return db_args\n\n\n#create a table\ndef create_table(table_name,colsize, ncolumns):\n db=Database(init())\n column_definitions = ', '.join(f'name{i} varchar({colsize})' for i in range(ncolumns))\n creat_sql = f'CREATE TABLE {table_name} (id int, {column_definitions}, time timestamp);'\n db.execute_sqls(creat_sql)\n\n#delete the table\ndef delete_table(table_name):\n db=Database(init())\n delete_sql=f'DROP TABLE if exists {table_name}'\n db.execute_sqls(delete_sql)\n\n#print the current time\ndef print_time():\n current_time = datetime.datetime.now()\n formatted_time = current_time.strftime(\"%Y-%m-%d %H:%M:%S\")\n print(formatted_time)\n\n\ndef missing_index(threads,duration,ncolumns,nrows,colsize,table_name='table1'):\n 
#create a new table\n print_time()\n db=Database(init())\n delete_table(table_name)\n create_table(table_name,colsize, ncolumns)\n\n # insert some data to be selected \n insert_definitions = ', '.join(f'(SELECT substr(md5(random()::text), 1, {colsize}))' for i in range(ncolumns))\n insert_data=f'insert into {table_name} select generate_series(1,{nrows}),{insert_definitions}, now();'\n db.execute_sqls(insert_data) \n\n #select without the index\n missing_index='select * from '+table_name+' where id='\n db.concurrent_execute_sql(threads,duration,missing_index,nrows)\n\n #delete the table\n delete_table(table_name)\n #print the end time\n print_time()\n\nif __name__ == \"__main__\":\n # Number of threads to use for concurrent inserts\n num_threads = 10\n \n # Duration for which to run the inserts (in seconds)\n insert_duration = None\n \n # Number of columns in the table\n num_columns = 10\n \n # Number of rows to insert\n num_rows = 4000000\n \n # Size of each column (in characters)\n column_size = 50\n \n # Table name\n table_name = 'table1'\n \n # Call the insert_large_data function\n missing_index(num_threads, insert_duration, num_columns, num_rows, column_size, table_name)\n", + "description": "In a financial data analysis system, 10 analysts simultaneously query a database table containing 10 columns and 4,000,000 rows. Each column has a size of 50 characters. However, there is no index in place for efficient searching, resulting in poor performance and longer query execution times.\n", + "desc": "In an e-commerce scenario, there is a database named 'OnlineStoreDB' for an online store. This database stores information about various products, including product details such as name, price, stock quantity, brand, and category. In this case, the database contains 4,000,000 rows of data in a table named 'ProductDetails'. Each row represents a product entry, and the table has a total of 10 columns, each with a size of 50 characters. However, this database has a performance issue due to the absence of necessary indexes on key columns, such as name, brand, and category. As a result, when multiple users search for products simultaneously, the database struggles to efficiently handle the large number of concurrent search requests. 
This inefficiency leads to anomalies in the database, such as delays or failures in search results, impacting the user experience and potentially affecting the overall efficiency and reputation of the online store.\n" + }, + "480": { + "start_time": "1697443770", + "end_time": "1697443843", + "start_timestamp": "2023-10-16 16:09:30", + "end_timestamp": "2023-10-16 16:10:43", + "alerts": [], + "labels": [ + "missing indexes" + ], + "command": "python anomaly_trigger/main.py --anomaly MISSING_INDEXES", + "script": "import psycopg2\nimport sys\nsys.path.append('/root/DB-GPT/')\nimport time\nimport datetime\nimport random\nimport yaml\nfrom multiprocessing.pool import *\n\n\nclass DBArgs(object):\n\n def __init__(self, dbtype, config, dbname=None):\n self.dbtype = dbtype\n if self.dbtype == 'mysql':\n self.host = config['host']\n self.port = config['port']\n self.user = config['user']\n self.password = config['password']\n self.dbname = dbname if dbname else config['dbname']\n self.driver = 'com.mysql.jdbc.Driver'\n self.jdbc = 'jdbc:mysql://'\n else:\n self.host = config['host']\n self.port = config['port']\n self.user = config['user']\n self.password = config['password']\n self.dbname = dbname if dbname else config['dbname']\n self.driver = 'org.postgresql.Driver'\n self.jdbc = 'jdbc:postgresql://'\n\nclass Database():\n def __init__(self, args, timeout=-1):\n self.args = args\n self.conn = self.resetConn(timeout)\n\n\n # self.schema = self.compute_table_schema()\n\n def resetConn(self, timeout=-1):\n conn = psycopg2.connect(database=self.args.dbname,\n user=self.args.user,\n password=self.args.password,\n host=self.args.host,\n port=self.args.port)\n return conn\n \n def execute_sqls(self,sql):\n self.conn =self.resetConn(timeout=-1)\n cur = self.conn.cursor()\n cur.execute(sql)\n self.conn.commit()\n cur.close()\n self.conn.close()\n\n def execute_sql_duration(self, duration, sql, max_id=0, commit_interval=500):\n self.conn = self.resetConn(timeout=-1)\n cursor = self.conn.cursor()\n start = time.time()\n cnt = 0\n if duration > 0:\n while (time.time() - start) < duration:\n if max_id > 0:\n id = random.randint(1, max_id - 1)\n cursor.execute(sql + str(id) + ';')\n else:\n cursor.execute(sql)\n cnt += 1\n if cnt % commit_interval == 0:\n self.conn.commit()\n else:\n print(\"error, the duration should be larger than 0\")\n self.conn.commit()\n cursor.close()\n self.conn.close()\n return cnt\n\n def concurrent_execute_sql(self, threads, duration, sql, max_id=0, commit_interval=500):\n pool = ThreadPool(threads)\n results = [pool.apply_async(self.execute_sql_duration, (duration, sql, max_id, commit_interval)) for _ in range(threads)]\n pool.close()\n pool.join()\n return results\n\ndef init():\n #add the config\n config_path = \"/root/DB-GPT/config/tool_config.yaml\"\n with open(config_path, 'r') as config_file:\n config = yaml.safe_load(config_file) \n db_args =DBArgs('pgsql', config)\n return db_args\n\n\n#create a table\ndef create_table(table_name,colsize, ncolumns):\n db=Database(init())\n column_definitions = ', '.join(f'name{i} varchar({colsize})' for i in range(ncolumns))\n creat_sql = f'CREATE TABLE {table_name} (id int, {column_definitions}, time timestamp);'\n db.execute_sqls(creat_sql)\n\n#delete the table\ndef delete_table(table_name):\n db=Database(init())\n delete_sql=f'DROP TABLE if exists {table_name}'\n db.execute_sqls(delete_sql)\n\n#print the current time\ndef print_time():\n current_time = datetime.datetime.now()\n formatted_time = current_time.strftime(\"%Y-%m-%d 
%H:%M:%S\")\n print(formatted_time)\n\n\ndef missing_index(threads,duration,ncolumns,nrows,colsize,table_name='table1'):\n #create a new table\n print_time()\n db=Database(init())\n delete_table(table_name)\n create_table(table_name,colsize, ncolumns)\n\n # insert some data to be selected \n insert_definitions = ', '.join(f'(SELECT substr(md5(random()::text), 1, {colsize}))' for i in range(ncolumns))\n insert_data=f'insert into {table_name} select generate_series(1,{nrows}),{insert_definitions}, now();'\n db.execute_sqls(insert_data) \n\n #select without the index\n missing_index='select * from '+table_name+' where id='\n db.concurrent_execute_sql(threads,duration,missing_index,nrows)\n\n #delete the table\n delete_table(table_name)\n #print the end time\n print_time()\n\nif __name__ == \"__main__\":\n # Number of threads to use for concurrent inserts\n num_threads = 10\n \n # Duration for which to run the inserts (in seconds)\n insert_duration = None\n \n # Number of columns in the table\n num_columns = 10\n \n # Number of rows to insert\n num_rows = 4000000\n \n # Size of each column (in characters)\n column_size = 50\n \n # Table name\n table_name = 'table1'\n \n # Call the insert_large_data function\n missing_index(num_threads, insert_duration, num_columns, num_rows, column_size, table_name)\n", + "description": "In a database of a financial institution, 10 customers are simultaneously searching for their transactions in a table with 10 columns and 4,000,000 rows. Each column has a size of 50 characters. However, there is no index present for the search, resulting in a database exception.\n", + "desc": "In a banking scenario, there is a database called 'BankingDB' that stores customer and transaction data for a bank. Within this database, there is a table named 'AccountTransactions' that records detailed information about various banking transactions. This table contains 4,000,000 rows of data, each representing a transaction record for an account. The table consists of 10 columns, with each column containing information of up to 50 characters. These columns may include transaction ID, account number, transaction type, transaction amount, transaction date, counterparty account information, transaction status, employee ID, transaction location, and currency type.However, the 'AccountTransactions' table lacks the necessary indexes, particularly on commonly used columns such as transaction type, transaction date, and account number. This absence of indexes makes it inefficient for the database to handle a large number of concurrent queries or searches. 
As a result, there may be delays or failures in processing banking transactions and queries, which can affect the overall efficiency and customer experience of the banking system.\n" + }, + "481": { + "start_time": "1697443927", + "end_time": "1697443998", + "start_timestamp": "2023-10-16 16:12:07", + "end_timestamp": "2023-10-16 16:13:18", + "alerts": [], + "labels": [ + "missing indexes" + ], + "command": "python anomaly_trigger/main.py --anomaly MISSING_INDEXES", + "script": "import psycopg2\nimport sys\nsys.path.append('/root/DB-GPT/')\nimport time\nimport datetime\nimport random\nimport yaml\nfrom multiprocessing.pool import *\n\n\nclass DBArgs(object):\n\n def __init__(self, dbtype, config, dbname=None):\n self.dbtype = dbtype\n if self.dbtype == 'mysql':\n self.host = config['host']\n self.port = config['port']\n self.user = config['user']\n self.password = config['password']\n self.dbname = dbname if dbname else config['dbname']\n self.driver = 'com.mysql.jdbc.Driver'\n self.jdbc = 'jdbc:mysql://'\n else:\n self.host = config['host']\n self.port = config['port']\n self.user = config['user']\n self.password = config['password']\n self.dbname = dbname if dbname else config['dbname']\n self.driver = 'org.postgresql.Driver'\n self.jdbc = 'jdbc:postgresql://'\n\nclass Database():\n def __init__(self, args, timeout=-1):\n self.args = args\n self.conn = self.resetConn(timeout)\n\n\n # self.schema = self.compute_table_schema()\n\n def resetConn(self, timeout=-1):\n conn = psycopg2.connect(database=self.args.dbname,\n user=self.args.user,\n password=self.args.password,\n host=self.args.host,\n port=self.args.port)\n return conn\n \n def execute_sqls(self,sql):\n self.conn =self.resetConn(timeout=-1)\n cur = self.conn.cursor()\n cur.execute(sql)\n self.conn.commit()\n cur.close()\n self.conn.close()\n\n def execute_sql_duration(self, duration, sql, max_id=0, commit_interval=500):\n self.conn = self.resetConn(timeout=-1)\n cursor = self.conn.cursor()\n start = time.time()\n cnt = 0\n if duration > 0:\n while (time.time() - start) < duration:\n if max_id > 0:\n id = random.randint(1, max_id - 1)\n cursor.execute(sql + str(id) + ';')\n else:\n cursor.execute(sql)\n cnt += 1\n if cnt % commit_interval == 0:\n self.conn.commit()\n else:\n print(\"error, the duration should be larger than 0\")\n self.conn.commit()\n cursor.close()\n self.conn.close()\n return cnt\n\n def concurrent_execute_sql(self, threads, duration, sql, max_id=0, commit_interval=500):\n pool = ThreadPool(threads)\n results = [pool.apply_async(self.execute_sql_duration, (duration, sql, max_id, commit_interval)) for _ in range(threads)]\n pool.close()\n pool.join()\n return results\n\ndef init():\n #add the config\n config_path = \"/root/DB-GPT/config/tool_config.yaml\"\n with open(config_path, 'r') as config_file:\n config = yaml.safe_load(config_file) \n db_args =DBArgs('pgsql', config)\n return db_args\n\n\n#create a table\ndef create_table(table_name,colsize, ncolumns):\n db=Database(init())\n column_definitions = ', '.join(f'name{i} varchar({colsize})' for i in range(ncolumns))\n creat_sql = f'CREATE TABLE {table_name} (id int, {column_definitions}, time timestamp);'\n db.execute_sqls(creat_sql)\n\n#delete the table\ndef delete_table(table_name):\n db=Database(init())\n delete_sql=f'DROP TABLE if exists {table_name}'\n db.execute_sqls(delete_sql)\n\n#print the current time\ndef print_time():\n current_time = datetime.datetime.now()\n formatted_time = current_time.strftime(\"%Y-%m-%d %H:%M:%S\")\n 
print(formatted_time)\n\n\ndef missing_index(threads,duration,ncolumns,nrows,colsize,table_name='table1'):\n #create a new table\n print_time()\n db=Database(init())\n delete_table(table_name)\n create_table(table_name,colsize, ncolumns)\n\n # insert some data to be selected \n insert_definitions = ', '.join(f'(SELECT substr(md5(random()::text), 1, {colsize}))' for i in range(ncolumns))\n insert_data=f'insert into {table_name} select generate_series(1,{nrows}),{insert_definitions}, now();'\n db.execute_sqls(insert_data) \n\n #select without the index\n missing_index='select * from '+table_name+' where id='\n db.concurrent_execute_sql(threads,duration,missing_index,nrows)\n\n #delete the table\n delete_table(table_name)\n #print the end time\n print_time()\n\nif __name__ == \"__main__\":\n # Number of threads to use for concurrent inserts\n num_threads = 5\n \n # Duration for which to run the inserts (in seconds)\n insert_duration = None\n \n # Number of columns in the table\n num_columns = 10\n \n # Number of rows to insert\n num_rows = 2000000\n \n # Size of each column (in characters)\n column_size = 100\n \n # Table name\n table_name = 'table1'\n \n # Call the insert_large_data function\n missing_index(num_threads, insert_duration, num_columns, num_rows, column_size, table_name)\n", + "description": "In a database used by an online platform, there are 5 users searching for information in a table with 10 columns and 2 million rows. Each column contains 100 characters. However, there is an issue with missing indexes, which may result in slower search performance and potentially cause database exceptions.\n", + "desc": "In an Internet of Things (IoT) scenario, there is a database that stores sensor data for a smart home system. This database is called 'SensorDataDB' and it is designed to handle a large volume of data from various sensors. One of the tables in this database is named 'SensorReadings' and it contains detailed information about the readings from 2,000,000 sensor events. Each event is represented as a row in the table and contains 10 columns, each with a size of 100 characters. These columns store information such as the sensor ID, reading type, reading value, timestamp, location, and status. However, due to the absence of necessary indexes on commonly used columns in the 'SensorReadings' table, such as sensor ID and reading type, the database becomes inefficient in handling a large number of concurrent read operations. 
This inefficiency can result in delays or failures in processing sensor data, affecting the overall performance and reliability of the smart home system.\n" + }, + "482": { + "start_time": "1697444058", + "end_time": "1697444128", + "start_timestamp": "2023-10-16 16:14:18", + "end_timestamp": "2023-10-16 16:15:28", + "alerts": [], + "labels": [ + "missing indexes" + ], + "command": "python anomaly_trigger/main.py --anomaly MISSING_INDEXES", + "script": "import psycopg2\nimport sys\nsys.path.append('/root/DB-GPT/')\nimport time\nimport datetime\nimport random\nimport yaml\nfrom multiprocessing.pool import *\n\n\nclass DBArgs(object):\n\n def __init__(self, dbtype, config, dbname=None):\n self.dbtype = dbtype\n if self.dbtype == 'mysql':\n self.host = config['host']\n self.port = config['port']\n self.user = config['user']\n self.password = config['password']\n self.dbname = dbname if dbname else config['dbname']\n self.driver = 'com.mysql.jdbc.Driver'\n self.jdbc = 'jdbc:mysql://'\n else:\n self.host = config['host']\n self.port = config['port']\n self.user = config['user']\n self.password = config['password']\n self.dbname = dbname if dbname else config['dbname']\n self.driver = 'org.postgresql.Driver'\n self.jdbc = 'jdbc:postgresql://'\n\nclass Database():\n def __init__(self, args, timeout=-1):\n self.args = args\n self.conn = self.resetConn(timeout)\n\n\n # self.schema = self.compute_table_schema()\n\n def resetConn(self, timeout=-1):\n conn = psycopg2.connect(database=self.args.dbname,\n user=self.args.user,\n password=self.args.password,\n host=self.args.host,\n port=self.args.port)\n return conn\n \n def execute_sqls(self,sql):\n self.conn =self.resetConn(timeout=-1)\n cur = self.conn.cursor()\n cur.execute(sql)\n self.conn.commit()\n cur.close()\n self.conn.close()\n\n def execute_sql_duration(self, duration, sql, max_id=0, commit_interval=500):\n self.conn = self.resetConn(timeout=-1)\n cursor = self.conn.cursor()\n start = time.time()\n cnt = 0\n if duration > 0:\n while (time.time() - start) < duration:\n if max_id > 0:\n id = random.randint(1, max_id - 1)\n cursor.execute(sql + str(id) + ';')\n else:\n cursor.execute(sql)\n cnt += 1\n if cnt % commit_interval == 0:\n self.conn.commit()\n else:\n print(\"error, the duration should be larger than 0\")\n self.conn.commit()\n cursor.close()\n self.conn.close()\n return cnt\n\n def concurrent_execute_sql(self, threads, duration, sql, max_id=0, commit_interval=500):\n pool = ThreadPool(threads)\n results = [pool.apply_async(self.execute_sql_duration, (duration, sql, max_id, commit_interval)) for _ in range(threads)]\n pool.close()\n pool.join()\n return results\n\ndef init():\n #add the config\n config_path = \"/root/DB-GPT/config/tool_config.yaml\"\n with open(config_path, 'r') as config_file:\n config = yaml.safe_load(config_file) \n db_args =DBArgs('pgsql', config)\n return db_args\n\n\n#create a table\ndef create_table(table_name,colsize, ncolumns):\n db=Database(init())\n column_definitions = ', '.join(f'name{i} varchar({colsize})' for i in range(ncolumns))\n creat_sql = f'CREATE TABLE {table_name} (id int, {column_definitions}, time timestamp);'\n db.execute_sqls(creat_sql)\n\n#delete the table\ndef delete_table(table_name):\n db=Database(init())\n delete_sql=f'DROP TABLE if exists {table_name}'\n db.execute_sqls(delete_sql)\n\n#print the current time\ndef print_time():\n current_time = datetime.datetime.now()\n formatted_time = current_time.strftime(\"%Y-%m-%d %H:%M:%S\")\n print(formatted_time)\n\n\ndef 
missing_index(threads,duration,ncolumns,nrows,colsize,table_name='table1'):\n #create a new table\n print_time()\n db=Database(init())\n delete_table(table_name)\n create_table(table_name,colsize, ncolumns)\n\n # insert some data to be selected \n insert_definitions = ', '.join(f'(SELECT substr(md5(random()::text), 1, {colsize}))' for i in range(ncolumns))\n insert_data=f'insert into {table_name} select generate_series(1,{nrows}),{insert_definitions}, now();'\n db.execute_sqls(insert_data) \n\n #select without the index\n missing_index='select * from '+table_name+' where id='\n db.concurrent_execute_sql(threads,duration,missing_index,nrows)\n\n #delete the table\n delete_table(table_name)\n #print the end time\n print_time()\n\nif __name__ == \"__main__\":\n # Number of threads to use for concurrent inserts\n num_threads = 5\n \n # Duration for which to run the inserts (in seconds)\n insert_duration = None\n \n # Number of columns in the table\n num_columns = 10\n \n # Number of rows to insert\n num_rows = 2000000\n \n # Size of each column (in characters)\n column_size = 100\n \n # Table name\n table_name = 'table1'\n \n # Call the insert_large_data function\n missing_index(num_threads, insert_duration, num_columns, num_rows, column_size, table_name)\n", + "description": "In a database used by a travel booking website, when 5 users simultaneously search for flights, hotels, and car rentals in a table with 10 columns and 2,000,000 rows, each column being 100 characters long, there is an exception caused by the absence of necessary indexes in the database.\n", + "desc": "In a life scenario involving an Internet of Things (IoT) environment, there is a database named 'SensorDataDB' specifically used for collecting and analyzing sensor data. This database is designed to handle a large volume of data from various types of sensors, such as temperature sensors, humidity sensors, pressure sensors, light sensors, and motion sensors. The primary table in the database is called 'SensorReadings' and it contains 2,000,000 rows of data, each representing a reading from a sensor. The table has 10 columns, each with a size of 100 characters, which store information about the sensor ID, type, reading value, timestamp, location, and status.However, due to the lack of necessary indexes in the 'SensorReadings' table, particularly on commonly used columns like sensor ID and timestamp, the database becomes inefficient in processing queries that require searching or filtering based on these columns. This inefficiency leads to anomalies in the database, as it struggles to handle a large number of concurrent data insertions or query requests. As a result, the database may experience performance degradation, delays, or even failures while processing these requests. 
This can impact the reliability and real-time monitoring capabilities of the IoT system, affecting the overall efficiency and effectiveness of the sensor data analysis process.\n" + }, + "483": { + "start_time": "1697444189", + "end_time": "1697444260", + "start_timestamp": "2023-10-16 16:16:29", + "end_timestamp": "2023-10-16 16:17:40", + "alerts": [], + "labels": [ + "missing indexes" + ], + "command": "python anomaly_trigger/main.py --anomaly MISSING_INDEXES", + "script": "import psycopg2\nimport sys\nsys.path.append('/root/DB-GPT/')\nimport time\nimport datetime\nimport random\nimport yaml\nfrom multiprocessing.pool import *\n\n\nclass DBArgs(object):\n\n def __init__(self, dbtype, config, dbname=None):\n self.dbtype = dbtype\n if self.dbtype == 'mysql':\n self.host = config['host']\n self.port = config['port']\n self.user = config['user']\n self.password = config['password']\n self.dbname = dbname if dbname else config['dbname']\n self.driver = 'com.mysql.jdbc.Driver'\n self.jdbc = 'jdbc:mysql://'\n else:\n self.host = config['host']\n self.port = config['port']\n self.user = config['user']\n self.password = config['password']\n self.dbname = dbname if dbname else config['dbname']\n self.driver = 'org.postgresql.Driver'\n self.jdbc = 'jdbc:postgresql://'\n\nclass Database():\n def __init__(self, args, timeout=-1):\n self.args = args\n self.conn = self.resetConn(timeout)\n\n\n # self.schema = self.compute_table_schema()\n\n def resetConn(self, timeout=-1):\n conn = psycopg2.connect(database=self.args.dbname,\n user=self.args.user,\n password=self.args.password,\n host=self.args.host,\n port=self.args.port)\n return conn\n \n def execute_sqls(self,sql):\n self.conn =self.resetConn(timeout=-1)\n cur = self.conn.cursor()\n cur.execute(sql)\n self.conn.commit()\n cur.close()\n self.conn.close()\n\n def execute_sql_duration(self, duration, sql, max_id=0, commit_interval=500):\n self.conn = self.resetConn(timeout=-1)\n cursor = self.conn.cursor()\n start = time.time()\n cnt = 0\n if duration > 0:\n while (time.time() - start) < duration:\n if max_id > 0:\n id = random.randint(1, max_id - 1)\n cursor.execute(sql + str(id) + ';')\n else:\n cursor.execute(sql)\n cnt += 1\n if cnt % commit_interval == 0:\n self.conn.commit()\n else:\n print(\"error, the duration should be larger than 0\")\n self.conn.commit()\n cursor.close()\n self.conn.close()\n return cnt\n\n def concurrent_execute_sql(self, threads, duration, sql, max_id=0, commit_interval=500):\n pool = ThreadPool(threads)\n results = [pool.apply_async(self.execute_sql_duration, (duration, sql, max_id, commit_interval)) for _ in range(threads)]\n pool.close()\n pool.join()\n return results\n\ndef init():\n #add the config\n config_path = \"/root/DB-GPT/config/tool_config.yaml\"\n with open(config_path, 'r') as config_file:\n config = yaml.safe_load(config_file) \n db_args =DBArgs('pgsql', config)\n return db_args\n\n\n#create a table\ndef create_table(table_name,colsize, ncolumns):\n db=Database(init())\n column_definitions = ', '.join(f'name{i} varchar({colsize})' for i in range(ncolumns))\n creat_sql = f'CREATE TABLE {table_name} (id int, {column_definitions}, time timestamp);'\n db.execute_sqls(creat_sql)\n\n#delete the table\ndef delete_table(table_name):\n db=Database(init())\n delete_sql=f'DROP TABLE if exists {table_name}'\n db.execute_sqls(delete_sql)\n\n#print the current time\ndef print_time():\n current_time = datetime.datetime.now()\n formatted_time = current_time.strftime(\"%Y-%m-%d %H:%M:%S\")\n print(formatted_time)\n\n\ndef 
missing_index(threads,duration,ncolumns,nrows,colsize,table_name='table1'):\n #create a new table\n print_time()\n db=Database(init())\n delete_table(table_name)\n create_table(table_name,colsize, ncolumns)\n\n # insert some data to be selected \n insert_definitions = ', '.join(f'(SELECT substr(md5(random()::text), 1, {colsize}))' for i in range(ncolumns))\n insert_data=f'insert into {table_name} select generate_series(1,{nrows}),{insert_definitions}, now();'\n db.execute_sqls(insert_data) \n\n #select without the index\n missing_index='select * from '+table_name+' where id='\n db.concurrent_execute_sql(threads,duration,missing_index,nrows)\n\n #delete the table\n delete_table(table_name)\n #print the end time\n print_time()\n\nif __name__ == \"__main__\":\n # Number of threads to use for concurrent inserts\n num_threads = 10\n \n # Duration for which to run the inserts (in seconds)\n insert_duration = None\n \n # Number of columns in the table\n num_columns = 10\n \n # Number of rows to insert\n num_rows = 2000000\n \n # Size of each column (in characters)\n column_size = 100\n \n # Table name\n table_name = 'table1'\n \n # Call the insert_large_data function\n missing_index(num_threads, insert_duration, num_columns, num_rows, column_size, table_name)\n", + "description": "In a database used by a financial institution, if there are 10 parallel transactions running in a table with 10 columns, each column capable of storing 100 characters, and a total of 2 million rows, but the necessary indexes are missing, this can lead to a performance degradation in the database.\n", + "desc": "In an Internet of Things (IoT) scenario, let's consider a database used for collecting and analyzing sensor data called 'SensorDataDB'. This database is designed to handle a large volume of data from various types of sensors. The primary table in the database is called 'SensorReadings' and it contains 2,000,000 rows of data, with each row representing a reading from a sensor. The table has 10 columns, each column containing information up to 100 characters in size. These columns may include sensor ID, reading type, reading value, timestamp, sensor location, and status information. In this scenario, there are 10 threads executing concurrently, each thread performing different operations on the database. However, due to the missing indexes on important columns such as sensor ID, reading type, and timestamp, the database is unable to efficiently process these concurrent operations. 
This inefficiency can lead to anomalies in the database, affecting the overall performance and potentially leading to delays or failures in processing sensor data.\n" + }, + "484": { + "start_time": "1697444320", + "end_time": "1697444392", + "start_timestamp": "2023-10-16 16:18:40", + "end_timestamp": "2023-10-16 16:19:52", + "alerts": [], + "labels": [ + "missing indexes" + ], + "command": "python anomaly_trigger/main.py --anomaly MISSING_INDEXES", + "script": "import psycopg2\nimport sys\nsys.path.append('/root/DB-GPT/')\nimport time\nimport datetime\nimport random\nimport yaml\nfrom multiprocessing.pool import *\n\n\nclass DBArgs(object):\n\n def __init__(self, dbtype, config, dbname=None):\n self.dbtype = dbtype\n if self.dbtype == 'mysql':\n self.host = config['host']\n self.port = config['port']\n self.user = config['user']\n self.password = config['password']\n self.dbname = dbname if dbname else config['dbname']\n self.driver = 'com.mysql.jdbc.Driver'\n self.jdbc = 'jdbc:mysql://'\n else:\n self.host = config['host']\n self.port = config['port']\n self.user = config['user']\n self.password = config['password']\n self.dbname = dbname if dbname else config['dbname']\n self.driver = 'org.postgresql.Driver'\n self.jdbc = 'jdbc:postgresql://'\n\nclass Database():\n def __init__(self, args, timeout=-1):\n self.args = args\n self.conn = self.resetConn(timeout)\n\n\n # self.schema = self.compute_table_schema()\n\n def resetConn(self, timeout=-1):\n conn = psycopg2.connect(database=self.args.dbname,\n user=self.args.user,\n password=self.args.password,\n host=self.args.host,\n port=self.args.port)\n return conn\n \n def execute_sqls(self,sql):\n self.conn =self.resetConn(timeout=-1)\n cur = self.conn.cursor()\n cur.execute(sql)\n self.conn.commit()\n cur.close()\n self.conn.close()\n\n def execute_sql_duration(self, duration, sql, max_id=0, commit_interval=500):\n self.conn = self.resetConn(timeout=-1)\n cursor = self.conn.cursor()\n start = time.time()\n cnt = 0\n if duration > 0:\n while (time.time() - start) < duration:\n if max_id > 0:\n id = random.randint(1, max_id - 1)\n cursor.execute(sql + str(id) + ';')\n else:\n cursor.execute(sql)\n cnt += 1\n if cnt % commit_interval == 0:\n self.conn.commit()\n else:\n print(\"error, the duration should be larger than 0\")\n self.conn.commit()\n cursor.close()\n self.conn.close()\n return cnt\n\n def concurrent_execute_sql(self, threads, duration, sql, max_id=0, commit_interval=500):\n pool = ThreadPool(threads)\n results = [pool.apply_async(self.execute_sql_duration, (duration, sql, max_id, commit_interval)) for _ in range(threads)]\n pool.close()\n pool.join()\n return results\n\ndef init():\n #add the config\n config_path = \"/root/DB-GPT/config/tool_config.yaml\"\n with open(config_path, 'r') as config_file:\n config = yaml.safe_load(config_file) \n db_args =DBArgs('pgsql', config)\n return db_args\n\n\n#create a table\ndef create_table(table_name,colsize, ncolumns):\n db=Database(init())\n column_definitions = ', '.join(f'name{i} varchar({colsize})' for i in range(ncolumns))\n creat_sql = f'CREATE TABLE {table_name} (id int, {column_definitions}, time timestamp);'\n db.execute_sqls(creat_sql)\n\n#delete the table\ndef delete_table(table_name):\n db=Database(init())\n delete_sql=f'DROP TABLE if exists {table_name}'\n db.execute_sqls(delete_sql)\n\n#print the current time\ndef print_time():\n current_time = datetime.datetime.now()\n formatted_time = current_time.strftime(\"%Y-%m-%d %H:%M:%S\")\n print(formatted_time)\n\n\ndef 
missing_index(threads,duration,ncolumns,nrows,colsize,table_name='table1'):\n #create a new table\n print_time()\n db=Database(init())\n delete_table(table_name)\n create_table(table_name,colsize, ncolumns)\n\n # insert some data to be selected \n insert_definitions = ', '.join(f'(SELECT substr(md5(random()::text), 1, {colsize}))' for i in range(ncolumns))\n insert_data=f'insert into {table_name} select generate_series(1,{nrows}),{insert_definitions}, now();'\n db.execute_sqls(insert_data) \n\n #select without the index\n missing_index='select * from '+table_name+' where id='\n db.concurrent_execute_sql(threads,duration,missing_index,nrows)\n\n #delete the table\n delete_table(table_name)\n #print the end time\n print_time()\n\nif __name__ == \"__main__\":\n # Number of threads to use for concurrent inserts\n num_threads = 10\n \n # Duration for which to run the inserts (in seconds)\n insert_duration = None\n \n # Number of columns in the table\n num_columns = 10\n \n # Number of rows to insert\n num_rows = 2000000\n \n # Size of each column (in characters)\n column_size = 100\n \n # Table name\n table_name = 'table1'\n \n # Call the insert_large_data function\n missing_index(num_threads, insert_duration, num_columns, num_rows, column_size, table_name)\n", + "description": "In a financial database, if there are 10 users searching in the database table containing 10 columns, 2,000,000 rows, each column size of 100 characters, but the search lacks the necessary index, resulting in an exception in the database.\n", + "desc": "In an Internet of Things (IoT) scenario, suppose there is a database named 'iotDB' that stores data collected from various IoT devices. This database contains a key table named 'SensorData', which records sensor readings from 2000000 devices. Each row in this table represents a reading from a single device, and there are a total of 10 columns, each of which can store up to 100 characters. These columns may include device ID, sensor type, sensor value, timestamp, location, and other relevant information. In this scenario, due to the lack of necessary indexes on commonly used columns, such as sensor type or timestamp, the database becomes inefficient in processing queries that involve filtering or searching based on these columns. As a result, when 10 users simultaneously request queries that involve filtering or searching on these columns, the database struggles to handle the high concurrency, leading to performance degradation and inefficiency in processing these queries. 
This can impact the real-time analysis and decision-making processes that rely on the timely and accurate processing of these queries.\n" + }, + "485": { + "start_time": "1697444504", + "end_time": "1697444576", + "start_timestamp": "2023-10-16 16:21:44", + "end_timestamp": "2023-10-16 16:22:56", + "alerts": [], + "labels": [ + "missing indexes" + ], + "command": "python anomaly_trigger/main.py --anomaly MISSING_INDEXES", + "script": "import psycopg2\nimport sys\nsys.path.append('/root/DB-GPT/')\nimport time\nimport datetime\nimport random\nimport yaml\nfrom multiprocessing.pool import *\n\n\nclass DBArgs(object):\n\n def __init__(self, dbtype, config, dbname=None):\n self.dbtype = dbtype\n if self.dbtype == 'mysql':\n self.host = config['host']\n self.port = config['port']\n self.user = config['user']\n self.password = config['password']\n self.dbname = dbname if dbname else config['dbname']\n self.driver = 'com.mysql.jdbc.Driver'\n self.jdbc = 'jdbc:mysql://'\n else:\n self.host = config['host']\n self.port = config['port']\n self.user = config['user']\n self.password = config['password']\n self.dbname = dbname if dbname else config['dbname']\n self.driver = 'org.postgresql.Driver'\n self.jdbc = 'jdbc:postgresql://'\n\nclass Database():\n def __init__(self, args, timeout=-1):\n self.args = args\n self.conn = self.resetConn(timeout)\n\n\n # self.schema = self.compute_table_schema()\n\n def resetConn(self, timeout=-1):\n conn = psycopg2.connect(database=self.args.dbname,\n user=self.args.user,\n password=self.args.password,\n host=self.args.host,\n port=self.args.port)\n return conn\n \n def execute_sqls(self,sql):\n self.conn =self.resetConn(timeout=-1)\n cur = self.conn.cursor()\n cur.execute(sql)\n self.conn.commit()\n cur.close()\n self.conn.close()\n\n def execute_sql_duration(self, duration, sql, max_id=0, commit_interval=500):\n self.conn = self.resetConn(timeout=-1)\n cursor = self.conn.cursor()\n start = time.time()\n cnt = 0\n if duration > 0:\n while (time.time() - start) < duration:\n if max_id > 0:\n id = random.randint(1, max_id - 1)\n cursor.execute(sql + str(id) + ';')\n else:\n cursor.execute(sql)\n cnt += 1\n if cnt % commit_interval == 0:\n self.conn.commit()\n else:\n print(\"error, the duration should be larger than 0\")\n self.conn.commit()\n cursor.close()\n self.conn.close()\n return cnt\n\n def concurrent_execute_sql(self, threads, duration, sql, max_id=0, commit_interval=500):\n pool = ThreadPool(threads)\n results = [pool.apply_async(self.execute_sql_duration, (duration, sql, max_id, commit_interval)) for _ in range(threads)]\n pool.close()\n pool.join()\n return results\n\ndef init():\n #add the config\n config_path = \"/root/DB-GPT/config/tool_config.yaml\"\n with open(config_path, 'r') as config_file:\n config = yaml.safe_load(config_file) \n db_args =DBArgs('pgsql', config)\n return db_args\n\n\n#create a table\ndef create_table(table_name,colsize, ncolumns):\n db=Database(init())\n column_definitions = ', '.join(f'name{i} varchar({colsize})' for i in range(ncolumns))\n creat_sql = f'CREATE TABLE {table_name} (id int, {column_definitions}, time timestamp);'\n db.execute_sqls(creat_sql)\n\n#delete the table\ndef delete_table(table_name):\n db=Database(init())\n delete_sql=f'DROP TABLE if exists {table_name}'\n db.execute_sqls(delete_sql)\n\n#print the current time\ndef print_time():\n current_time = datetime.datetime.now()\n formatted_time = current_time.strftime(\"%Y-%m-%d %H:%M:%S\")\n print(formatted_time)\n\n\ndef 
missing_index(threads,duration,ncolumns,nrows,colsize,table_name='table1'):\n #create a new table\n print_time()\n db=Database(init())\n delete_table(table_name)\n create_table(table_name,colsize, ncolumns)\n\n # insert some data to be selected \n insert_definitions = ', '.join(f'(SELECT substr(md5(random()::text), 1, {colsize}))' for i in range(ncolumns))\n insert_data=f'insert into {table_name} select generate_series(1,{nrows}),{insert_definitions}, now();'\n db.execute_sqls(insert_data) \n\n #select without the index\n missing_index='select * from '+table_name+' where id='\n db.concurrent_execute_sql(threads,duration,missing_index,nrows)\n\n #delete the table\n delete_table(table_name)\n #print the end time\n print_time()\n\nif __name__ == \"__main__\":\n # Number of threads to use for concurrent inserts\n num_threads = 5\n \n # Duration for which to run the inserts (in seconds)\n insert_duration = None\n \n # Number of columns in the table\n num_columns = 10\n \n # Number of rows to insert\n num_rows = 4000000\n \n # Size of each column (in characters)\n column_size = 100\n \n # Table name\n table_name = 'table1'\n \n # Call the insert_large_data function\n missing_index(num_threads, insert_duration, num_columns, num_rows, column_size, table_name)\n", + "description": "In a database of a financial institution, if 5 employees are searching in a database table containing 10 columns and 4,000,000 rows, each column size being 100 characters, but the search lacks the necessary index, an exception will occur in the database.\n", + "desc": "In this Internet of Things (IoT) scenario, let's consider a database system used for collecting and analyzing sensor data from various IoT devices. This database, named 'IoTDataDB', is designed to handle a large volume of sensor data and support real-time data processing. One of the key tables in this database is the 'SensorData' table, which stores data from different sensors, such as temperature, humidity, pressure, light, motion, etc. Each row in this table represents a sensor reading and contains information about the sensor ID, reading type, reading value, timestamp, location, and status. Suppose this table contains 4,000,000 rows of data, with a total of 10 columns, each column capable of storing up to 100 characters.In this scenario, the database experiences poor performance and efficiency due to missing indexes on the 'SensorData' table. These missing indexes particularly impact commonly used search and filtering operations on columns such as sensor ID, timestamp, or location. Without proper indexes, the database is unable to efficiently process concurrent read and search requests, leading to suboptimal query performance and increased response times.For example, when 5 users simultaneously query the 'SensorData' table for sensor readings matching specific criteria, such as temperature values within a certain range or readings from a specific time period, the database struggles to quickly find and retrieve the relevant data. As a result, users experience delays in receiving query results, and the overall efficiency of the IoT data processing system is compromised.To resolve this issue, it is recommended to create appropriate indexes on the 'SensorData' table, specifically targeting the columns commonly used in query conditions. These indexes will allow the database to quickly locate the required data, improving query performance and reducing response times. 
By implementing these missing indexes in the 'IoTDataDB' database, the overall efficiency and effectiveness of the IoT data processing system can be significantly improved.\n" + }, + "486": { + "start_time": "1697444636", + "end_time": "1697444708", + "start_timestamp": "2023-10-16 16:23:56", + "end_timestamp": "2023-10-16 16:25:08", + "alerts": [], + "labels": [ + "missing indexes" + ], + "command": "python anomaly_trigger/main.py --anomaly MISSING_INDEXES", + "script": "import psycopg2\nimport sys\nsys.path.append('/root/DB-GPT/')\nimport time\nimport datetime\nimport random\nimport yaml\nfrom multiprocessing.pool import *\n\n\nclass DBArgs(object):\n\n def __init__(self, dbtype, config, dbname=None):\n self.dbtype = dbtype\n if self.dbtype == 'mysql':\n self.host = config['host']\n self.port = config['port']\n self.user = config['user']\n self.password = config['password']\n self.dbname = dbname if dbname else config['dbname']\n self.driver = 'com.mysql.jdbc.Driver'\n self.jdbc = 'jdbc:mysql://'\n else:\n self.host = config['host']\n self.port = config['port']\n self.user = config['user']\n self.password = config['password']\n self.dbname = dbname if dbname else config['dbname']\n self.driver = 'org.postgresql.Driver'\n self.jdbc = 'jdbc:postgresql://'\n\nclass Database():\n def __init__(self, args, timeout=-1):\n self.args = args\n self.conn = self.resetConn(timeout)\n\n\n # self.schema = self.compute_table_schema()\n\n def resetConn(self, timeout=-1):\n conn = psycopg2.connect(database=self.args.dbname,\n user=self.args.user,\n password=self.args.password,\n host=self.args.host,\n port=self.args.port)\n return conn\n \n def execute_sqls(self,sql):\n self.conn =self.resetConn(timeout=-1)\n cur = self.conn.cursor()\n cur.execute(sql)\n self.conn.commit()\n cur.close()\n self.conn.close()\n\n def execute_sql_duration(self, duration, sql, max_id=0, commit_interval=500):\n self.conn = self.resetConn(timeout=-1)\n cursor = self.conn.cursor()\n start = time.time()\n cnt = 0\n if duration > 0:\n while (time.time() - start) < duration:\n if max_id > 0:\n id = random.randint(1, max_id - 1)\n cursor.execute(sql + str(id) + ';')\n else:\n cursor.execute(sql)\n cnt += 1\n if cnt % commit_interval == 0:\n self.conn.commit()\n else:\n print(\"error, the duration should be larger than 0\")\n self.conn.commit()\n cursor.close()\n self.conn.close()\n return cnt\n\n def concurrent_execute_sql(self, threads, duration, sql, max_id=0, commit_interval=500):\n pool = ThreadPool(threads)\n results = [pool.apply_async(self.execute_sql_duration, (duration, sql, max_id, commit_interval)) for _ in range(threads)]\n pool.close()\n pool.join()\n return results\n\ndef init():\n #add the config\n config_path = \"/root/DB-GPT/config/tool_config.yaml\"\n with open(config_path, 'r') as config_file:\n config = yaml.safe_load(config_file) \n db_args =DBArgs('pgsql', config)\n return db_args\n\n\n#create a table\ndef create_table(table_name,colsize, ncolumns):\n db=Database(init())\n column_definitions = ', '.join(f'name{i} varchar({colsize})' for i in range(ncolumns))\n creat_sql = f'CREATE TABLE {table_name} (id int, {column_definitions}, time timestamp);'\n db.execute_sqls(creat_sql)\n\n#delete the table\ndef delete_table(table_name):\n db=Database(init())\n delete_sql=f'DROP TABLE if exists {table_name}'\n db.execute_sqls(delete_sql)\n\n#print the current time\ndef print_time():\n current_time = datetime.datetime.now()\n formatted_time = current_time.strftime(\"%Y-%m-%d %H:%M:%S\")\n print(formatted_time)\n\n\ndef 
missing_index(threads,duration,ncolumns,nrows,colsize,table_name='table1'):\n #create a new table\n print_time()\n db=Database(init())\n delete_table(table_name)\n create_table(table_name,colsize, ncolumns)\n\n # insert some data to be selected \n insert_definitions = ', '.join(f'(SELECT substr(md5(random()::text), 1, {colsize}))' for i in range(ncolumns))\n insert_data=f'insert into {table_name} select generate_series(1,{nrows}),{insert_definitions}, now();'\n db.execute_sqls(insert_data) \n\n #select without the index\n missing_index='select * from '+table_name+' where id='\n db.concurrent_execute_sql(threads,duration,missing_index,nrows)\n\n #delete the table\n delete_table(table_name)\n #print the end time\n print_time()\n\nif __name__ == \"__main__\":\n # Number of threads to use for concurrent inserts\n num_threads = 5\n \n # Duration for which to run the inserts (in seconds)\n insert_duration = None\n \n # Number of columns in the table\n num_columns = 10\n \n # Number of rows to insert\n num_rows = 4000000\n \n # Size of each column (in characters)\n column_size = 100\n \n # Table name\n table_name = 'table1'\n \n # Call the insert_large_data function\n missing_index(num_threads, insert_duration, num_columns, num_rows, column_size, table_name)\n", + "description": "In a database used for a financial analysis application, if 5 users search for data in a table containing 10 columns and 4,000,000 rows, each column storing data up to 100 characters long, but the necessary indexes are missing, it may result in a database exception.\n", + "desc": "In an Internet of Things (IoT) scenario, suppose there is a database specifically designed to store and analyze sensor data, called 'SensorDataDB'. This database is used to collect and process data from various types of sensors. One of the key tables in this database is 'SensorReadings', which contains information about sensor readings, such as temperature, humidity, pressure, light, motion, and more. This table consists of 4,000,000 rows of data, each representing a reading from a sensor. It includes 10 columns, each with a size of 100 characters. These columns might include sensor ID, reading type, reading value, timestamp, sensor location, and status information. In this scenario, the database is experiencing efficiency issues due to the absence of necessary indexes on commonly used search columns in the 'SensorReadings' table. This lack of indexes makes it difficult for the database to handle a large number of concurrent search requests efficiently. This inefficiency leads to anomalies in the database, causing delays or failures in processing these search requests. 
Not only does this impact the user experience, but it can also affect the overall operational efficiency and reputation of the IoT system.\n" + }, + "487": { + "start_time": "1697444769", + "end_time": "1697444842", + "start_timestamp": "2023-10-16 16:26:09", + "end_timestamp": "2023-10-16 16:27:22", + "alerts": [], + "labels": [ + "missing indexes" + ], + "command": "python anomaly_trigger/main.py --anomaly MISSING_INDEXES", + "script": "import psycopg2\nimport sys\nsys.path.append('/root/DB-GPT/')\nimport time\nimport datetime\nimport random\nimport yaml\nfrom multiprocessing.pool import *\n\n\nclass DBArgs(object):\n\n def __init__(self, dbtype, config, dbname=None):\n self.dbtype = dbtype\n if self.dbtype == 'mysql':\n self.host = config['host']\n self.port = config['port']\n self.user = config['user']\n self.password = config['password']\n self.dbname = dbname if dbname else config['dbname']\n self.driver = 'com.mysql.jdbc.Driver'\n self.jdbc = 'jdbc:mysql://'\n else:\n self.host = config['host']\n self.port = config['port']\n self.user = config['user']\n self.password = config['password']\n self.dbname = dbname if dbname else config['dbname']\n self.driver = 'org.postgresql.Driver'\n self.jdbc = 'jdbc:postgresql://'\n\nclass Database():\n def __init__(self, args, timeout=-1):\n self.args = args\n self.conn = self.resetConn(timeout)\n\n\n # self.schema = self.compute_table_schema()\n\n def resetConn(self, timeout=-1):\n conn = psycopg2.connect(database=self.args.dbname,\n user=self.args.user,\n password=self.args.password,\n host=self.args.host,\n port=self.args.port)\n return conn\n \n def execute_sqls(self,sql):\n self.conn =self.resetConn(timeout=-1)\n cur = self.conn.cursor()\n cur.execute(sql)\n self.conn.commit()\n cur.close()\n self.conn.close()\n\n def execute_sql_duration(self, duration, sql, max_id=0, commit_interval=500):\n self.conn = self.resetConn(timeout=-1)\n cursor = self.conn.cursor()\n start = time.time()\n cnt = 0\n if duration > 0:\n while (time.time() - start) < duration:\n if max_id > 0:\n id = random.randint(1, max_id - 1)\n cursor.execute(sql + str(id) + ';')\n else:\n cursor.execute(sql)\n cnt += 1\n if cnt % commit_interval == 0:\n self.conn.commit()\n else:\n print(\"error, the duration should be larger than 0\")\n self.conn.commit()\n cursor.close()\n self.conn.close()\n return cnt\n\n def concurrent_execute_sql(self, threads, duration, sql, max_id=0, commit_interval=500):\n pool = ThreadPool(threads)\n results = [pool.apply_async(self.execute_sql_duration, (duration, sql, max_id, commit_interval)) for _ in range(threads)]\n pool.close()\n pool.join()\n return results\n\ndef init():\n #add the config\n config_path = \"/root/DB-GPT/config/tool_config.yaml\"\n with open(config_path, 'r') as config_file:\n config = yaml.safe_load(config_file) \n db_args =DBArgs('pgsql', config)\n return db_args\n\n\n#create a table\ndef create_table(table_name,colsize, ncolumns):\n db=Database(init())\n column_definitions = ', '.join(f'name{i} varchar({colsize})' for i in range(ncolumns))\n creat_sql = f'CREATE TABLE {table_name} (id int, {column_definitions}, time timestamp);'\n db.execute_sqls(creat_sql)\n\n#delete the table\ndef delete_table(table_name):\n db=Database(init())\n delete_sql=f'DROP TABLE if exists {table_name}'\n db.execute_sqls(delete_sql)\n\n#print the current time\ndef print_time():\n current_time = datetime.datetime.now()\n formatted_time = current_time.strftime(\"%Y-%m-%d %H:%M:%S\")\n print(formatted_time)\n\n\ndef 
missing_index(threads,duration,ncolumns,nrows,colsize,table_name='table1'):\n #create a new table\n print_time()\n db=Database(init())\n delete_table(table_name)\n create_table(table_name,colsize, ncolumns)\n\n # insert some data to be selected \n insert_definitions = ', '.join(f'(SELECT substr(md5(random()::text), 1, {colsize}))' for i in range(ncolumns))\n insert_data=f'insert into {table_name} select generate_series(1,{nrows}),{insert_definitions}, now();'\n db.execute_sqls(insert_data) \n\n #select without the index\n missing_index='select * from '+table_name+' where id='\n db.concurrent_execute_sql(threads,duration,missing_index,nrows)\n\n #delete the table\n delete_table(table_name)\n #print the end time\n print_time()\n\nif __name__ == \"__main__\":\n # Number of threads to use for concurrent inserts\n num_threads = 10\n \n # Duration for which to run the inserts (in seconds)\n insert_duration = None\n \n # Number of columns in the table\n num_columns = 10\n \n # Number of rows to insert\n num_rows = 4000000\n \n # Size of each column (in characters)\n column_size = 100\n \n # Table name\n table_name = 'table1'\n \n # Call the insert_large_data function\n missing_index(num_threads, insert_duration, num_columns, num_rows, column_size, table_name)\n", + "description": "In a large-scale database of an online platform, 10 users are searching for information in a table with 10 columns, containing 4,000,000 rows, where each column has a size of 100 characters. However, the search is missing the necessary indexes, resulting in a database exception.\n", + "desc": "In an Internet of Things (IoT) scenario, imagine a database called 'IoTDataDB' that stores data collected from various sensors deployed in smart home devices. This database is designed to handle a large volume of sensor data and includes a table named 'SensorReadings' that records detailed information from these sensors. The table consists of 4,000,000 rows of data, with each row representing a sensor reading and containing 10 columns. These columns may include sensor ID, reading type (such as temperature, humidity, motion), reading value, timestamp, location, and other relevant information. However, in this scenario, due to the absence of necessary indexes on commonly used columns, such as sensor ID and reading type, the database is inefficient in processing queries that involve filtering by these columns. This inefficiency becomes apparent when there are 10 concurrent threads attempting to access the database, with each thread simulating sensor readings. The lack of indexes causes the database to struggle in handling these concurrent requests, leading to delays or failures in query processing. 
This not only affects the user experience but also hampers the operational efficiency of the smart home system.\n" + }, + "488": { + "start_time": "1697444903", + "end_time": "1697444976", + "start_timestamp": "2023-10-16 16:28:23", + "end_timestamp": "2023-10-16 16:29:36", + "alerts": [], + "labels": [ + "missing indexes" + ], + "command": "python anomaly_trigger/main.py --anomaly MISSING_INDEXES", + "script": "import psycopg2\nimport sys\nsys.path.append('/root/DB-GPT/')\nimport time\nimport datetime\nimport random\nimport yaml\nfrom multiprocessing.pool import *\n\n\nclass DBArgs(object):\n\n def __init__(self, dbtype, config, dbname=None):\n self.dbtype = dbtype\n if self.dbtype == 'mysql':\n self.host = config['host']\n self.port = config['port']\n self.user = config['user']\n self.password = config['password']\n self.dbname = dbname if dbname else config['dbname']\n self.driver = 'com.mysql.jdbc.Driver'\n self.jdbc = 'jdbc:mysql://'\n else:\n self.host = config['host']\n self.port = config['port']\n self.user = config['user']\n self.password = config['password']\n self.dbname = dbname if dbname else config['dbname']\n self.driver = 'org.postgresql.Driver'\n self.jdbc = 'jdbc:postgresql://'\n\nclass Database():\n def __init__(self, args, timeout=-1):\n self.args = args\n self.conn = self.resetConn(timeout)\n\n\n # self.schema = self.compute_table_schema()\n\n def resetConn(self, timeout=-1):\n conn = psycopg2.connect(database=self.args.dbname,\n user=self.args.user,\n password=self.args.password,\n host=self.args.host,\n port=self.args.port)\n return conn\n \n def execute_sqls(self,sql):\n self.conn =self.resetConn(timeout=-1)\n cur = self.conn.cursor()\n cur.execute(sql)\n self.conn.commit()\n cur.close()\n self.conn.close()\n\n def execute_sql_duration(self, duration, sql, max_id=0, commit_interval=500):\n self.conn = self.resetConn(timeout=-1)\n cursor = self.conn.cursor()\n start = time.time()\n cnt = 0\n if duration > 0:\n while (time.time() - start) < duration:\n if max_id > 0:\n id = random.randint(1, max_id - 1)\n cursor.execute(sql + str(id) + ';')\n else:\n cursor.execute(sql)\n cnt += 1\n if cnt % commit_interval == 0:\n self.conn.commit()\n else:\n print(\"error, the duration should be larger than 0\")\n self.conn.commit()\n cursor.close()\n self.conn.close()\n return cnt\n\n def concurrent_execute_sql(self, threads, duration, sql, max_id=0, commit_interval=500):\n pool = ThreadPool(threads)\n results = [pool.apply_async(self.execute_sql_duration, (duration, sql, max_id, commit_interval)) for _ in range(threads)]\n pool.close()\n pool.join()\n return results\n\ndef init():\n #add the config\n config_path = \"/root/DB-GPT/config/tool_config.yaml\"\n with open(config_path, 'r') as config_file:\n config = yaml.safe_load(config_file) \n db_args =DBArgs('pgsql', config)\n return db_args\n\n\n#create a table\ndef create_table(table_name,colsize, ncolumns):\n db=Database(init())\n column_definitions = ', '.join(f'name{i} varchar({colsize})' for i in range(ncolumns))\n creat_sql = f'CREATE TABLE {table_name} (id int, {column_definitions}, time timestamp);'\n db.execute_sqls(creat_sql)\n\n#delete the table\ndef delete_table(table_name):\n db=Database(init())\n delete_sql=f'DROP TABLE if exists {table_name}'\n db.execute_sqls(delete_sql)\n\n#print the current time\ndef print_time():\n current_time = datetime.datetime.now()\n formatted_time = current_time.strftime(\"%Y-%m-%d %H:%M:%S\")\n print(formatted_time)\n\n\ndef 
missing_index(threads,duration,ncolumns,nrows,colsize,table_name='table1'):\n #create a new table\n print_time()\n db=Database(init())\n delete_table(table_name)\n create_table(table_name,colsize, ncolumns)\n\n # insert some data to be selected \n insert_definitions = ', '.join(f'(SELECT substr(md5(random()::text), 1, {colsize}))' for i in range(ncolumns))\n insert_data=f'insert into {table_name} select generate_series(1,{nrows}),{insert_definitions}, now();'\n db.execute_sqls(insert_data) \n\n #select without the index\n missing_index='select * from '+table_name+' where id='\n db.concurrent_execute_sql(threads,duration,missing_index,nrows)\n\n #delete the table\n delete_table(table_name)\n #print the end time\n print_time()\n\nif __name__ == \"__main__\":\n # Number of threads to use for concurrent inserts\n num_threads = 10\n \n # Duration for which to run the inserts (in seconds)\n insert_duration = None\n \n # Number of columns in the table\n num_columns = 10\n \n # Number of rows to insert\n num_rows = 4000000\n \n # Size of each column (in characters)\n column_size = 100\n \n # Table name\n table_name = 'table1'\n \n # Call the insert_large_data function\n missing_index(num_threads, insert_duration, num_columns, num_rows, column_size, table_name)\n", + "description": "In a business database system, if there are 10 users concurrently searching a table with 10 columns, each containing 100 characters, and a total of 4,000,000 rows of data, but the necessary indexes are missing, it may result in exceptions or slower performance in the database.\n", + "desc": "In an IoT scenario, let's consider a database used for collecting and analyzing sensor data, called 'SensorDataDB'. This database is designed to handle a large volume of data from various types of sensors. One of the tables in this database is named 'SensorReadings', which stores data from different sensors. This table consists of 4,000,000 rows, each representing a reading from a sensor, and has a total of 10 columns. These columns may include sensor ID, reading type, reading value, timestamp, sensor location, and status information. However, in this scenario, there is a problem with the performance of the database. Due to the lack of necessary indexes, particularly on commonly used columns such as sensor ID and timestamp, the database becomes inefficient in handling a large number of concurrent read and write requests. This inefficiency leads to anomalies in the database, as it cannot efficiently process these numerous concurrent requests. 
This not only results in delays or failures in data processing, but it may also affect the accuracy and timeliness of the collected sensor data.\n" + }, + "489": { + "start_time": "1697445061", + "end_time": "1697445132", + "start_timestamp": "2023-10-16 16:31:01", + "end_timestamp": "2023-10-16 16:32:12", + "alerts": [], + "labels": [ + "missing indexes" + ], + "command": "python anomaly_trigger/main.py --anomaly MISSING_INDEXES", + "script": "import psycopg2\nimport sys\nsys.path.append('/root/DB-GPT/')\nimport time\nimport datetime\nimport random\nimport yaml\nfrom multiprocessing.pool import *\n\n\nclass DBArgs(object):\n\n def __init__(self, dbtype, config, dbname=None):\n self.dbtype = dbtype\n if self.dbtype == 'mysql':\n self.host = config['host']\n self.port = config['port']\n self.user = config['user']\n self.password = config['password']\n self.dbname = dbname if dbname else config['dbname']\n self.driver = 'com.mysql.jdbc.Driver'\n self.jdbc = 'jdbc:mysql://'\n else:\n self.host = config['host']\n self.port = config['port']\n self.user = config['user']\n self.password = config['password']\n self.dbname = dbname if dbname else config['dbname']\n self.driver = 'org.postgresql.Driver'\n self.jdbc = 'jdbc:postgresql://'\n\nclass Database():\n def __init__(self, args, timeout=-1):\n self.args = args\n self.conn = self.resetConn(timeout)\n\n\n # self.schema = self.compute_table_schema()\n\n def resetConn(self, timeout=-1):\n conn = psycopg2.connect(database=self.args.dbname,\n user=self.args.user,\n password=self.args.password,\n host=self.args.host,\n port=self.args.port)\n return conn\n \n def execute_sqls(self,sql):\n self.conn =self.resetConn(timeout=-1)\n cur = self.conn.cursor()\n cur.execute(sql)\n self.conn.commit()\n cur.close()\n self.conn.close()\n\n def execute_sql_duration(self, duration, sql, max_id=0, commit_interval=500):\n self.conn = self.resetConn(timeout=-1)\n cursor = self.conn.cursor()\n start = time.time()\n cnt = 0\n if duration > 0:\n while (time.time() - start) < duration:\n if max_id > 0:\n id = random.randint(1, max_id - 1)\n cursor.execute(sql + str(id) + ';')\n else:\n cursor.execute(sql)\n cnt += 1\n if cnt % commit_interval == 0:\n self.conn.commit()\n else:\n print(\"error, the duration should be larger than 0\")\n self.conn.commit()\n cursor.close()\n self.conn.close()\n return cnt\n\n def concurrent_execute_sql(self, threads, duration, sql, max_id=0, commit_interval=500):\n pool = ThreadPool(threads)\n results = [pool.apply_async(self.execute_sql_duration, (duration, sql, max_id, commit_interval)) for _ in range(threads)]\n pool.close()\n pool.join()\n return results\n\ndef init():\n #add the config\n config_path = \"/root/DB-GPT/config/tool_config.yaml\"\n with open(config_path, 'r') as config_file:\n config = yaml.safe_load(config_file) \n db_args =DBArgs('pgsql', config)\n return db_args\n\n\n#create a table\ndef create_table(table_name,colsize, ncolumns):\n db=Database(init())\n column_definitions = ', '.join(f'name{i} varchar({colsize})' for i in range(ncolumns))\n creat_sql = f'CREATE TABLE {table_name} (id int, {column_definitions}, time timestamp);'\n db.execute_sqls(creat_sql)\n\n#delete the table\ndef delete_table(table_name):\n db=Database(init())\n delete_sql=f'DROP TABLE if exists {table_name}'\n db.execute_sqls(delete_sql)\n\n#print the current time\ndef print_time():\n current_time = datetime.datetime.now()\n formatted_time = current_time.strftime(\"%Y-%m-%d %H:%M:%S\")\n print(formatted_time)\n\n\ndef 
missing_index(threads,duration,ncolumns,nrows,colsize,table_name='table1'):\n #create a new table\n print_time()\n db=Database(init())\n delete_table(table_name)\n create_table(table_name,colsize, ncolumns)\n\n # insert some data to be selected \n insert_definitions = ', '.join(f'(SELECT substr(md5(random()::text), 1, {colsize}))' for i in range(ncolumns))\n insert_data=f'insert into {table_name} select generate_series(1,{nrows}),{insert_definitions}, now();'\n db.execute_sqls(insert_data) \n\n #select without the index\n missing_index='select * from '+table_name+' where id='\n db.concurrent_execute_sql(threads,duration,missing_index,nrows)\n\n #delete the table\n delete_table(table_name)\n #print the end time\n print_time()\n\nif __name__ == \"__main__\":\n # Number of threads to use for concurrent inserts\n num_threads = 5\n \n # Duration for which to run the inserts (in seconds)\n insert_duration = None\n \n # Number of columns in the table\n num_columns = 20\n \n # Number of rows to insert\n num_rows = 2000000\n \n # Size of each column (in characters)\n column_size = 50\n \n # Table name\n table_name = 'table1'\n \n # Call the insert_large_data function\n missing_index(num_threads, insert_duration, num_columns, num_rows, column_size, table_name)\n", + "description": "In the database of a social media platform, if there are 5 users searching in the database table containing 20 columns, 2,000,000 rows, each column size of 50 characters, but the search lacks the necessary index, resulting in an exception in the database.\n", + "desc": "In an Internet of Things (IoT) scenario, there is a database used for storing and analyzing sensor data, named 'SensorDataDB'. This database is designed to handle a large volume of data from various types of sensors. The primary table in the database is called 'SensorReadings', which contains various fields to store data from 2,000,000 sensors. These fields may include sensor ID, reading type (such as temperature, humidity, pressure, light, motion, etc.), reading value, timestamp, sensor location, and status information. When the database is not properly indexed, particularly on commonly used search columns such as sensor type, reading value, and timestamp, it becomes inefficient in handling a large number of concurrent search requests. This inefficiency eventually leads to anomalies in the database, as it cannot efficiently process these numerous concurrent search requests. 
This not only results in a decline in system performance (e.g., search delays or failures) but may also affect the overall effectiveness and reliability of the IoT system.\n" + }, + "490": { + "start_time": "1697445192", + "end_time": "1697445263", + "start_timestamp": "2023-10-16 16:33:12", + "end_timestamp": "2023-10-16 16:34:23", + "alerts": [], + "labels": [ + "missing indexes" + ], + "command": "python anomaly_trigger/main.py --anomaly MISSING_INDEXES", + "script": "import psycopg2\nimport sys\nsys.path.append('/root/DB-GPT/')\nimport time\nimport datetime\nimport random\nimport yaml\nfrom multiprocessing.pool import *\n\n\nclass DBArgs(object):\n\n def __init__(self, dbtype, config, dbname=None):\n self.dbtype = dbtype\n if self.dbtype == 'mysql':\n self.host = config['host']\n self.port = config['port']\n self.user = config['user']\n self.password = config['password']\n self.dbname = dbname if dbname else config['dbname']\n self.driver = 'com.mysql.jdbc.Driver'\n self.jdbc = 'jdbc:mysql://'\n else:\n self.host = config['host']\n self.port = config['port']\n self.user = config['user']\n self.password = config['password']\n self.dbname = dbname if dbname else config['dbname']\n self.driver = 'org.postgresql.Driver'\n self.jdbc = 'jdbc:postgresql://'\n\nclass Database():\n def __init__(self, args, timeout=-1):\n self.args = args\n self.conn = self.resetConn(timeout)\n\n\n # self.schema = self.compute_table_schema()\n\n def resetConn(self, timeout=-1):\n conn = psycopg2.connect(database=self.args.dbname,\n user=self.args.user,\n password=self.args.password,\n host=self.args.host,\n port=self.args.port)\n return conn\n \n def execute_sqls(self,sql):\n self.conn =self.resetConn(timeout=-1)\n cur = self.conn.cursor()\n cur.execute(sql)\n self.conn.commit()\n cur.close()\n self.conn.close()\n\n def execute_sql_duration(self, duration, sql, max_id=0, commit_interval=500):\n self.conn = self.resetConn(timeout=-1)\n cursor = self.conn.cursor()\n start = time.time()\n cnt = 0\n if duration > 0:\n while (time.time() - start) < duration:\n if max_id > 0:\n id = random.randint(1, max_id - 1)\n cursor.execute(sql + str(id) + ';')\n else:\n cursor.execute(sql)\n cnt += 1\n if cnt % commit_interval == 0:\n self.conn.commit()\n else:\n print(\"error, the duration should be larger than 0\")\n self.conn.commit()\n cursor.close()\n self.conn.close()\n return cnt\n\n def concurrent_execute_sql(self, threads, duration, sql, max_id=0, commit_interval=500):\n pool = ThreadPool(threads)\n results = [pool.apply_async(self.execute_sql_duration, (duration, sql, max_id, commit_interval)) for _ in range(threads)]\n pool.close()\n pool.join()\n return results\n\ndef init():\n #add the config\n config_path = \"/root/DB-GPT/config/tool_config.yaml\"\n with open(config_path, 'r') as config_file:\n config = yaml.safe_load(config_file) \n db_args =DBArgs('pgsql', config)\n return db_args\n\n\n#create a table\ndef create_table(table_name,colsize, ncolumns):\n db=Database(init())\n column_definitions = ', '.join(f'name{i} varchar({colsize})' for i in range(ncolumns))\n creat_sql = f'CREATE TABLE {table_name} (id int, {column_definitions}, time timestamp);'\n db.execute_sqls(creat_sql)\n\n#delete the table\ndef delete_table(table_name):\n db=Database(init())\n delete_sql=f'DROP TABLE if exists {table_name}'\n db.execute_sqls(delete_sql)\n\n#print the current time\ndef print_time():\n current_time = datetime.datetime.now()\n formatted_time = current_time.strftime(\"%Y-%m-%d %H:%M:%S\")\n print(formatted_time)\n\n\ndef 
missing_index(threads,duration,ncolumns,nrows,colsize,table_name='table1'):\n #create a new table\n print_time()\n db=Database(init())\n delete_table(table_name)\n create_table(table_name,colsize, ncolumns)\n\n # insert some data to be selected \n insert_definitions = ', '.join(f'(SELECT substr(md5(random()::text), 1, {colsize}))' for i in range(ncolumns))\n insert_data=f'insert into {table_name} select generate_series(1,{nrows}),{insert_definitions}, now();'\n db.execute_sqls(insert_data) \n\n #select without the index\n missing_index='select * from '+table_name+' where id='\n db.concurrent_execute_sql(threads,duration,missing_index,nrows)\n\n #delete the table\n delete_table(table_name)\n #print the end time\n print_time()\n\nif __name__ == \"__main__\":\n # Number of threads to use for concurrent inserts\n num_threads = 5\n \n # Duration for which to run the inserts (in seconds)\n insert_duration = None\n \n # Number of columns in the table\n num_columns = 20\n \n # Number of rows to insert\n num_rows = 2000000\n \n # Size of each column (in characters)\n column_size = 50\n \n # Table name\n table_name = 'table1'\n \n # Call the insert_large_data function\n missing_index(num_threads, insert_duration, num_columns, num_rows, column_size, table_name)\n", + "description": "In the database of an online shopping platform, there are 5 users searching in the database table containing 20 columns, 2,000,000 rows, each column size of 50 characters, but the search lacks the necessary index, resulting in an exception in the database.\n", + "desc": "In an Internet of Things (IoT) scenario, let's consider a database that is used to store sensor data, named 'SensorDataDB'. This database is designed to handle a large volume of data from various types of sensors. One of the key tables in the database is called 'SensorReadings', which stores detailed information about the readings from these sensors. Each row in the table represents a single reading from a specific sensor and contains information such as sensor ID, reading type, reading value, timestamp, and sensor location. In this particular scenario, the database has a total of 2,000,000 rows in the 'SensorReadings' table, with each row containing 20 columns. The size of each column is 50 characters. However, there is an issue with missing indexes in this table, particularly on commonly used search columns such as sensor ID, reading type, and timestamp. Due to the lack of these indexes, the database becomes inefficient in handling search queries that require filtering or sorting based on these columns. This inefficiency can lead to longer query execution times and decreased overall performance of the database. 
Ultimately, this can impact the effectiveness of data analysis and decision-making processes that rely on the sensor data stored in the database.\n" + }, + "491": { + "start_time": "1697445323", + "end_time": "1697445395", + "start_timestamp": "2023-10-16 16:35:23", + "end_timestamp": "2023-10-16 16:36:35", + "alerts": [], + "labels": [ + "missing indexes" + ], + "command": "python anomaly_trigger/main.py --anomaly MISSING_INDEXES", + "script": "import psycopg2\nimport sys\nsys.path.append('/root/DB-GPT/')\nimport time\nimport datetime\nimport random\nimport yaml\nfrom multiprocessing.pool import *\n\n\nclass DBArgs(object):\n\n def __init__(self, dbtype, config, dbname=None):\n self.dbtype = dbtype\n if self.dbtype == 'mysql':\n self.host = config['host']\n self.port = config['port']\n self.user = config['user']\n self.password = config['password']\n self.dbname = dbname if dbname else config['dbname']\n self.driver = 'com.mysql.jdbc.Driver'\n self.jdbc = 'jdbc:mysql://'\n else:\n self.host = config['host']\n self.port = config['port']\n self.user = config['user']\n self.password = config['password']\n self.dbname = dbname if dbname else config['dbname']\n self.driver = 'org.postgresql.Driver'\n self.jdbc = 'jdbc:postgresql://'\n\nclass Database():\n def __init__(self, args, timeout=-1):\n self.args = args\n self.conn = self.resetConn(timeout)\n\n\n # self.schema = self.compute_table_schema()\n\n def resetConn(self, timeout=-1):\n conn = psycopg2.connect(database=self.args.dbname,\n user=self.args.user,\n password=self.args.password,\n host=self.args.host,\n port=self.args.port)\n return conn\n \n def execute_sqls(self,sql):\n self.conn =self.resetConn(timeout=-1)\n cur = self.conn.cursor()\n cur.execute(sql)\n self.conn.commit()\n cur.close()\n self.conn.close()\n\n def execute_sql_duration(self, duration, sql, max_id=0, commit_interval=500):\n self.conn = self.resetConn(timeout=-1)\n cursor = self.conn.cursor()\n start = time.time()\n cnt = 0\n if duration > 0:\n while (time.time() - start) < duration:\n if max_id > 0:\n id = random.randint(1, max_id - 1)\n cursor.execute(sql + str(id) + ';')\n else:\n cursor.execute(sql)\n cnt += 1\n if cnt % commit_interval == 0:\n self.conn.commit()\n else:\n print(\"error, the duration should be larger than 0\")\n self.conn.commit()\n cursor.close()\n self.conn.close()\n return cnt\n\n def concurrent_execute_sql(self, threads, duration, sql, max_id=0, commit_interval=500):\n pool = ThreadPool(threads)\n results = [pool.apply_async(self.execute_sql_duration, (duration, sql, max_id, commit_interval)) for _ in range(threads)]\n pool.close()\n pool.join()\n return results\n\ndef init():\n #add the config\n config_path = \"/root/DB-GPT/config/tool_config.yaml\"\n with open(config_path, 'r') as config_file:\n config = yaml.safe_load(config_file) \n db_args =DBArgs('pgsql', config)\n return db_args\n\n\n#create a table\ndef create_table(table_name,colsize, ncolumns):\n db=Database(init())\n column_definitions = ', '.join(f'name{i} varchar({colsize})' for i in range(ncolumns))\n creat_sql = f'CREATE TABLE {table_name} (id int, {column_definitions}, time timestamp);'\n db.execute_sqls(creat_sql)\n\n#delete the table\ndef delete_table(table_name):\n db=Database(init())\n delete_sql=f'DROP TABLE if exists {table_name}'\n db.execute_sqls(delete_sql)\n\n#print the current time\ndef print_time():\n current_time = datetime.datetime.now()\n formatted_time = current_time.strftime(\"%Y-%m-%d %H:%M:%S\")\n print(formatted_time)\n\n\ndef 
missing_index(threads,duration,ncolumns,nrows,colsize,table_name='table1'):\n #create a new table\n print_time()\n db=Database(init())\n delete_table(table_name)\n create_table(table_name,colsize, ncolumns)\n\n # insert some data to be selected \n insert_definitions = ', '.join(f'(SELECT substr(md5(random()::text), 1, {colsize}))' for i in range(ncolumns))\n insert_data=f'insert into {table_name} select generate_series(1,{nrows}),{insert_definitions}, now();'\n db.execute_sqls(insert_data) \n\n #select without the index\n missing_index='select * from '+table_name+' where id='\n db.concurrent_execute_sql(threads,duration,missing_index,nrows)\n\n #delete the table\n delete_table(table_name)\n #print the end time\n print_time()\n\nif __name__ == \"__main__\":\n # Number of threads to use for concurrent inserts\n num_threads = 10\n \n # Duration for which to run the inserts (in seconds)\n insert_duration = None\n \n # Number of columns in the table\n num_columns = 20\n \n # Number of rows to insert\n num_rows = 2000000\n \n # Size of each column (in characters)\n column_size = 50\n \n # Table name\n table_name = 'table1'\n \n # Call the insert_large_data function\n missing_index(num_threads, insert_duration, num_columns, num_rows, column_size, table_name)\n", + "description": "In an online shopping platform, when 10 users search the database table containing 20 columns and 2,000,000 rows, with each column size of 50 characters, but the search lacks the necessary index, an exception occurs in the database.\n", + "desc": "In a database used for an e-commerce platform, named 'ECommerceDB', there is a table called 'ProductDetails' which contains information about various products. This table consists of 2 million rows of data, with each row representing a product and having 20 columns. These columns include product ID, name, price, stock quantity, brand, category, size, color, weight, user rating, number of reviews, production date, expiration date, supplier information, country, shipping method, discount information, image link, description, and product status. However, in this table, there is a lack of necessary indexes on commonly used search columns such as name, brand, and category. As a result, when multiple users simultaneously search for products on the e-commerce platform, the database becomes inefficient in handling these search requests, leading to delays or failures in the search process. 
This inefficiency and the resulting anomalies not only degrade the user experience but also impact the overall operational efficiency and reputation of the e-commerce platform.\n" + }, + "492": { + "start_time": "1697445455", + "end_time": "1697445527", + "start_timestamp": "2023-10-16 16:37:35", + "end_timestamp": "2023-10-16 16:38:47", + "alerts": [], + "labels": [ + "missing indexes" + ], + "command": "python anomaly_trigger/main.py --anomaly MISSING_INDEXES", + "script": "import psycopg2\nimport sys\nsys.path.append('/root/DB-GPT/')\nimport time\nimport datetime\nimport random\nimport yaml\nfrom multiprocessing.pool import *\n\n\nclass DBArgs(object):\n\n def __init__(self, dbtype, config, dbname=None):\n self.dbtype = dbtype\n if self.dbtype == 'mysql':\n self.host = config['host']\n self.port = config['port']\n self.user = config['user']\n self.password = config['password']\n self.dbname = dbname if dbname else config['dbname']\n self.driver = 'com.mysql.jdbc.Driver'\n self.jdbc = 'jdbc:mysql://'\n else:\n self.host = config['host']\n self.port = config['port']\n self.user = config['user']\n self.password = config['password']\n self.dbname = dbname if dbname else config['dbname']\n self.driver = 'org.postgresql.Driver'\n self.jdbc = 'jdbc:postgresql://'\n\nclass Database():\n def __init__(self, args, timeout=-1):\n self.args = args\n self.conn = self.resetConn(timeout)\n\n\n # self.schema = self.compute_table_schema()\n\n def resetConn(self, timeout=-1):\n conn = psycopg2.connect(database=self.args.dbname,\n user=self.args.user,\n password=self.args.password,\n host=self.args.host,\n port=self.args.port)\n return conn\n \n def execute_sqls(self,sql):\n self.conn =self.resetConn(timeout=-1)\n cur = self.conn.cursor()\n cur.execute(sql)\n self.conn.commit()\n cur.close()\n self.conn.close()\n\n def execute_sql_duration(self, duration, sql, max_id=0, commit_interval=500):\n self.conn = self.resetConn(timeout=-1)\n cursor = self.conn.cursor()\n start = time.time()\n cnt = 0\n if duration > 0:\n while (time.time() - start) < duration:\n if max_id > 0:\n id = random.randint(1, max_id - 1)\n cursor.execute(sql + str(id) + ';')\n else:\n cursor.execute(sql)\n cnt += 1\n if cnt % commit_interval == 0:\n self.conn.commit()\n else:\n print(\"error, the duration should be larger than 0\")\n self.conn.commit()\n cursor.close()\n self.conn.close()\n return cnt\n\n def concurrent_execute_sql(self, threads, duration, sql, max_id=0, commit_interval=500):\n pool = ThreadPool(threads)\n results = [pool.apply_async(self.execute_sql_duration, (duration, sql, max_id, commit_interval)) for _ in range(threads)]\n pool.close()\n pool.join()\n return results\n\ndef init():\n #add the config\n config_path = \"/root/DB-GPT/config/tool_config.yaml\"\n with open(config_path, 'r') as config_file:\n config = yaml.safe_load(config_file) \n db_args =DBArgs('pgsql', config)\n return db_args\n\n\n#create a table\ndef create_table(table_name,colsize, ncolumns):\n db=Database(init())\n column_definitions = ', '.join(f'name{i} varchar({colsize})' for i in range(ncolumns))\n creat_sql = f'CREATE TABLE {table_name} (id int, {column_definitions}, time timestamp);'\n db.execute_sqls(creat_sql)\n\n#delete the table\ndef delete_table(table_name):\n db=Database(init())\n delete_sql=f'DROP TABLE if exists {table_name}'\n db.execute_sqls(delete_sql)\n\n#print the current time\ndef print_time():\n current_time = datetime.datetime.now()\n formatted_time = current_time.strftime(\"%Y-%m-%d %H:%M:%S\")\n print(formatted_time)\n\n\ndef 
missing_index(threads,duration,ncolumns,nrows,colsize,table_name='table1'):\n #create a new table\n print_time()\n db=Database(init())\n delete_table(table_name)\n create_table(table_name,colsize, ncolumns)\n\n # insert some data to be selected \n insert_definitions = ', '.join(f'(SELECT substr(md5(random()::text), 1, {colsize}))' for i in range(ncolumns))\n insert_data=f'insert into {table_name} select generate_series(1,{nrows}),{insert_definitions}, now();'\n db.execute_sqls(insert_data) \n\n #select without the index\n missing_index='select * from '+table_name+' where id='\n db.concurrent_execute_sql(threads,duration,missing_index,nrows)\n\n #delete the table\n delete_table(table_name)\n #print the end time\n print_time()\n\nif __name__ == \"__main__\":\n # Number of threads to use for concurrent inserts\n num_threads = 10\n \n # Duration for which to run the inserts (in seconds)\n insert_duration = None\n \n # Number of columns in the table\n num_columns = 20\n \n # Number of rows to insert\n num_rows = 2000000\n \n # Size of each column (in characters)\n column_size = 50\n \n # Table name\n table_name = 'table1'\n \n # Call the insert_large_data function\n missing_index(num_threads, insert_duration, num_columns, num_rows, column_size, table_name)\n", + "description": "In the database of an online shopping platform, if there are 10 users searching in the database table containing 20 columns, 2,000,000 rows, each column size of 50 characters, but the search lacks the necessary index, resulting in an exception in the database.\n", + "desc": "In an Internet of Things (IoT) scenario, suppose there is a database called 'IoTDataDB' that stores data from various IoT devices. One table in this database, named 'DeviceReadings', contains information from 2,000,000 rows, each representing a reading from a different IoT device. This table has 20 columns, including device ID, timestamp, temperature, humidity, pressure, light intensity, motion detection, and more. Each column can store data up to 50 characters in size. However, the 'DeviceReadings' table lacks necessary indexes on commonly queried columns such as device ID, timestamp, and temperature. As a result, querying this table for specific devices, time periods, or temperature ranges becomes inefficient, especially when there are multiple users or applications concurrently accessing the database. 
The absence of these indexes leads to a decline in query performance and increases the time required to retrieve data from the 'DeviceReadings' table.\n" + }, + "493": { + "start_time": "1697445639", + "end_time": "1697445711", + "start_timestamp": "2023-10-16 16:40:39", + "end_timestamp": "2023-10-16 16:41:51", + "alerts": [], + "labels": [ + "missing indexes" + ], + "command": "python anomaly_trigger/main.py --anomaly MISSING_INDEXES", + "script": "import psycopg2\nimport sys\nsys.path.append('/root/DB-GPT/')\nimport time\nimport datetime\nimport random\nimport yaml\nfrom multiprocessing.pool import *\n\n\nclass DBArgs(object):\n\n def __init__(self, dbtype, config, dbname=None):\n self.dbtype = dbtype\n if self.dbtype == 'mysql':\n self.host = config['host']\n self.port = config['port']\n self.user = config['user']\n self.password = config['password']\n self.dbname = dbname if dbname else config['dbname']\n self.driver = 'com.mysql.jdbc.Driver'\n self.jdbc = 'jdbc:mysql://'\n else:\n self.host = config['host']\n self.port = config['port']\n self.user = config['user']\n self.password = config['password']\n self.dbname = dbname if dbname else config['dbname']\n self.driver = 'org.postgresql.Driver'\n self.jdbc = 'jdbc:postgresql://'\n\nclass Database():\n def __init__(self, args, timeout=-1):\n self.args = args\n self.conn = self.resetConn(timeout)\n\n\n # self.schema = self.compute_table_schema()\n\n def resetConn(self, timeout=-1):\n conn = psycopg2.connect(database=self.args.dbname,\n user=self.args.user,\n password=self.args.password,\n host=self.args.host,\n port=self.args.port)\n return conn\n \n def execute_sqls(self,sql):\n self.conn =self.resetConn(timeout=-1)\n cur = self.conn.cursor()\n cur.execute(sql)\n self.conn.commit()\n cur.close()\n self.conn.close()\n\n def execute_sql_duration(self, duration, sql, max_id=0, commit_interval=500):\n self.conn = self.resetConn(timeout=-1)\n cursor = self.conn.cursor()\n start = time.time()\n cnt = 0\n if duration > 0:\n while (time.time() - start) < duration:\n if max_id > 0:\n id = random.randint(1, max_id - 1)\n cursor.execute(sql + str(id) + ';')\n else:\n cursor.execute(sql)\n cnt += 1\n if cnt % commit_interval == 0:\n self.conn.commit()\n else:\n print(\"error, the duration should be larger than 0\")\n self.conn.commit()\n cursor.close()\n self.conn.close()\n return cnt\n\n def concurrent_execute_sql(self, threads, duration, sql, max_id=0, commit_interval=500):\n pool = ThreadPool(threads)\n results = [pool.apply_async(self.execute_sql_duration, (duration, sql, max_id, commit_interval)) for _ in range(threads)]\n pool.close()\n pool.join()\n return results\n\ndef init():\n #add the config\n config_path = \"/root/DB-GPT/config/tool_config.yaml\"\n with open(config_path, 'r') as config_file:\n config = yaml.safe_load(config_file) \n db_args =DBArgs('pgsql', config)\n return db_args\n\n\n#create a table\ndef create_table(table_name,colsize, ncolumns):\n db=Database(init())\n column_definitions = ', '.join(f'name{i} varchar({colsize})' for i in range(ncolumns))\n creat_sql = f'CREATE TABLE {table_name} (id int, {column_definitions}, time timestamp);'\n db.execute_sqls(creat_sql)\n\n#delete the table\ndef delete_table(table_name):\n db=Database(init())\n delete_sql=f'DROP TABLE if exists {table_name}'\n db.execute_sqls(delete_sql)\n\n#print the current time\ndef print_time():\n current_time = datetime.datetime.now()\n formatted_time = current_time.strftime(\"%Y-%m-%d %H:%M:%S\")\n print(formatted_time)\n\n\ndef 
missing_index(threads,duration,ncolumns,nrows,colsize,table_name='table1'):\n #create a new table\n print_time()\n db=Database(init())\n delete_table(table_name)\n create_table(table_name,colsize, ncolumns)\n\n # insert some data to be selected \n insert_definitions = ', '.join(f'(SELECT substr(md5(random()::text), 1, {colsize}))' for i in range(ncolumns))\n insert_data=f'insert into {table_name} select generate_series(1,{nrows}),{insert_definitions}, now();'\n db.execute_sqls(insert_data) \n\n #select without the index\n missing_index='select * from '+table_name+' where id='\n db.concurrent_execute_sql(threads,duration,missing_index,nrows)\n\n #delete the table\n delete_table(table_name)\n #print the end time\n print_time()\n\nif __name__ == \"__main__\":\n # Number of threads to use for concurrent inserts\n num_threads = 5\n \n # Duration for which to run the inserts (in seconds)\n insert_duration = None\n \n # Number of columns in the table\n num_columns = 20\n \n # Number of rows to insert\n num_rows = 4000000\n \n # Size of each column (in characters)\n column_size = 50\n \n # Table name\n table_name = 'table1'\n \n # Call the insert_large_data function\n missing_index(num_threads, insert_duration, num_columns, num_rows, column_size, table_name)\n", + "description": "In an online banking system, if 5 users search the database table containing 20 columns and 4,000,000 rows, where each column has a size of 50 characters, but the search does not have the necessary index, it could result in a database exception.\n", + "desc": "In a banking scenario, suppose there is a database named 'BankRecordsDB' that stores various banking information, such as customer details, account information, transaction records, and more. Within this database, there is a table called 'AccountTransactions' that contains 4,000,000 rows of transaction data, with each row representing a specific transaction. This table has 20 columns including transaction ID, account number, transaction type, transaction amount, date and time, counterparty information, status, and more. However, there is a lack of necessary indexes on commonly used columns, such as account number, transaction type, and transaction amount. As a result, the database becomes inefficient in handling concurrent queries and updates related to transaction information. 
This inefficiency can lead to slower response times, higher resource consumption, and potential anomalies in transaction processing within the banking system.\n" + }, + "494": { + "start_time": "1697445771", + "end_time": "1697445843", + "start_timestamp": "2023-10-16 16:42:51", + "end_timestamp": "2023-10-16 16:44:03", + "alerts": [], + "labels": [ + "missing indexes" + ], + "command": "python anomaly_trigger/main.py --anomaly MISSING_INDEXES", + "script": "import psycopg2\nimport sys\nsys.path.append('/root/DB-GPT/')\nimport time\nimport datetime\nimport random\nimport yaml\nfrom multiprocessing.pool import *\n\n\nclass DBArgs(object):\n\n def __init__(self, dbtype, config, dbname=None):\n self.dbtype = dbtype\n if self.dbtype == 'mysql':\n self.host = config['host']\n self.port = config['port']\n self.user = config['user']\n self.password = config['password']\n self.dbname = dbname if dbname else config['dbname']\n self.driver = 'com.mysql.jdbc.Driver'\n self.jdbc = 'jdbc:mysql://'\n else:\n self.host = config['host']\n self.port = config['port']\n self.user = config['user']\n self.password = config['password']\n self.dbname = dbname if dbname else config['dbname']\n self.driver = 'org.postgresql.Driver'\n self.jdbc = 'jdbc:postgresql://'\n\nclass Database():\n def __init__(self, args, timeout=-1):\n self.args = args\n self.conn = self.resetConn(timeout)\n\n\n # self.schema = self.compute_table_schema()\n\n def resetConn(self, timeout=-1):\n conn = psycopg2.connect(database=self.args.dbname,\n user=self.args.user,\n password=self.args.password,\n host=self.args.host,\n port=self.args.port)\n return conn\n \n def execute_sqls(self,sql):\n self.conn =self.resetConn(timeout=-1)\n cur = self.conn.cursor()\n cur.execute(sql)\n self.conn.commit()\n cur.close()\n self.conn.close()\n\n def execute_sql_duration(self, duration, sql, max_id=0, commit_interval=500):\n self.conn = self.resetConn(timeout=-1)\n cursor = self.conn.cursor()\n start = time.time()\n cnt = 0\n if duration > 0:\n while (time.time() - start) < duration:\n if max_id > 0:\n id = random.randint(1, max_id - 1)\n cursor.execute(sql + str(id) + ';')\n else:\n cursor.execute(sql)\n cnt += 1\n if cnt % commit_interval == 0:\n self.conn.commit()\n else:\n print(\"error, the duration should be larger than 0\")\n self.conn.commit()\n cursor.close()\n self.conn.close()\n return cnt\n\n def concurrent_execute_sql(self, threads, duration, sql, max_id=0, commit_interval=500):\n pool = ThreadPool(threads)\n results = [pool.apply_async(self.execute_sql_duration, (duration, sql, max_id, commit_interval)) for _ in range(threads)]\n pool.close()\n pool.join()\n return results\n\ndef init():\n #add the config\n config_path = \"/root/DB-GPT/config/tool_config.yaml\"\n with open(config_path, 'r') as config_file:\n config = yaml.safe_load(config_file) \n db_args =DBArgs('pgsql', config)\n return db_args\n\n\n#create a table\ndef create_table(table_name,colsize, ncolumns):\n db=Database(init())\n column_definitions = ', '.join(f'name{i} varchar({colsize})' for i in range(ncolumns))\n creat_sql = f'CREATE TABLE {table_name} (id int, {column_definitions}, time timestamp);'\n db.execute_sqls(creat_sql)\n\n#delete the table\ndef delete_table(table_name):\n db=Database(init())\n delete_sql=f'DROP TABLE if exists {table_name}'\n db.execute_sqls(delete_sql)\n\n#print the current time\ndef print_time():\n current_time = datetime.datetime.now()\n formatted_time = current_time.strftime(\"%Y-%m-%d %H:%M:%S\")\n print(formatted_time)\n\n\ndef 
missing_index(threads,duration,ncolumns,nrows,colsize,table_name='table1'):\n #create a new table\n print_time()\n db=Database(init())\n delete_table(table_name)\n create_table(table_name,colsize, ncolumns)\n\n # insert some data to be selected \n insert_definitions = ', '.join(f'(SELECT substr(md5(random()::text), 1, {colsize}))' for i in range(ncolumns))\n insert_data=f'insert into {table_name} select generate_series(1,{nrows}),{insert_definitions}, now();'\n db.execute_sqls(insert_data) \n\n #select without the index\n missing_index='select * from '+table_name+' where id='\n db.concurrent_execute_sql(threads,duration,missing_index,nrows)\n\n #delete the table\n delete_table(table_name)\n #print the end time\n print_time()\n\nif __name__ == \"__main__\":\n # Number of threads to use for concurrent inserts\n num_threads = 5\n \n # Duration for which to run the inserts (in seconds)\n insert_duration = None\n \n # Number of columns in the table\n num_columns = 20\n \n # Number of rows to insert\n num_rows = 4000000\n \n # Size of each column (in characters)\n column_size = 50\n \n # Table name\n table_name = 'table1'\n \n # Call the insert_large_data function\n missing_index(num_threads, insert_duration, num_columns, num_rows, column_size, table_name)\n", + "description": "In a database used by an online shopping platform, there is an issue with missing indexes. The script is being used to simulate the scenario where 5 users are searching in a table with 20 columns, each having a size of 50 characters, and 4 million rows. This lack of indexes can cause performance issues and potentially lead to exceptions in the database.\n", + "desc": "In this scenario, we can imagine a database used in an Internet of Things (IoT) system. Let's say this database is called \"IoTDevicesDB\" and is responsible for storing data from various IoT devices. It has a main table named \"DeviceData\" that records data from these devices. This table contains about 4,000,000 rows, each representing a data entry from a device. It has 20 columns, including device ID, device type, timestamp, temperature, humidity, pressure, location, and more, each with a size of 50 characters. However, there is an issue with the database configuration regarding missing indexes. Specifically, there are no indexes created on commonly used columns such as device type, timestamp, and location. This lack of indexes makes the database inefficient in handling a high number of concurrent queries, especially those involving filtering or searching based on these columns. As a result, when multiple users or systems try to query the database simultaneously, the performance might be slow, with delays or even failures in retrieving the desired data. 
This can degrade the user experience and impact the overall functionality of the IoT system.\n" + }, + "495": { + "start_time": "1697445903", + "end_time": "1697445976", + "start_timestamp": "2023-10-16 16:45:03", + "end_timestamp": "2023-10-16 16:46:16", + "alerts": [], + "labels": [ + "missing indexes" + ], + "command": "python anomaly_trigger/main.py --anomaly MISSING_INDEXES", + "script": "import psycopg2\nimport sys\nsys.path.append('/root/DB-GPT/')\nimport time\nimport datetime\nimport random\nimport yaml\nfrom multiprocessing.pool import *\n\n\nclass DBArgs(object):\n\n def __init__(self, dbtype, config, dbname=None):\n self.dbtype = dbtype\n if self.dbtype == 'mysql':\n self.host = config['host']\n self.port = config['port']\n self.user = config['user']\n self.password = config['password']\n self.dbname = dbname if dbname else config['dbname']\n self.driver = 'com.mysql.jdbc.Driver'\n self.jdbc = 'jdbc:mysql://'\n else:\n self.host = config['host']\n self.port = config['port']\n self.user = config['user']\n self.password = config['password']\n self.dbname = dbname if dbname else config['dbname']\n self.driver = 'org.postgresql.Driver'\n self.jdbc = 'jdbc:postgresql://'\n\nclass Database():\n def __init__(self, args, timeout=-1):\n self.args = args\n self.conn = self.resetConn(timeout)\n\n\n # self.schema = self.compute_table_schema()\n\n def resetConn(self, timeout=-1):\n conn = psycopg2.connect(database=self.args.dbname,\n user=self.args.user,\n password=self.args.password,\n host=self.args.host,\n port=self.args.port)\n return conn\n \n def execute_sqls(self,sql):\n self.conn =self.resetConn(timeout=-1)\n cur = self.conn.cursor()\n cur.execute(sql)\n self.conn.commit()\n cur.close()\n self.conn.close()\n\n def execute_sql_duration(self, duration, sql, max_id=0, commit_interval=500):\n self.conn = self.resetConn(timeout=-1)\n cursor = self.conn.cursor()\n start = time.time()\n cnt = 0\n if duration > 0:\n while (time.time() - start) < duration:\n if max_id > 0:\n id = random.randint(1, max_id - 1)\n cursor.execute(sql + str(id) + ';')\n else:\n cursor.execute(sql)\n cnt += 1\n if cnt % commit_interval == 0:\n self.conn.commit()\n else:\n print(\"error, the duration should be larger than 0\")\n self.conn.commit()\n cursor.close()\n self.conn.close()\n return cnt\n\n def concurrent_execute_sql(self, threads, duration, sql, max_id=0, commit_interval=500):\n pool = ThreadPool(threads)\n results = [pool.apply_async(self.execute_sql_duration, (duration, sql, max_id, commit_interval)) for _ in range(threads)]\n pool.close()\n pool.join()\n return results\n\ndef init():\n #add the config\n config_path = \"/root/DB-GPT/config/tool_config.yaml\"\n with open(config_path, 'r') as config_file:\n config = yaml.safe_load(config_file) \n db_args =DBArgs('pgsql', config)\n return db_args\n\n\n#create a table\ndef create_table(table_name,colsize, ncolumns):\n db=Database(init())\n column_definitions = ', '.join(f'name{i} varchar({colsize})' for i in range(ncolumns))\n creat_sql = f'CREATE TABLE {table_name} (id int, {column_definitions}, time timestamp);'\n db.execute_sqls(creat_sql)\n\n#delete the table\ndef delete_table(table_name):\n db=Database(init())\n delete_sql=f'DROP TABLE if exists {table_name}'\n db.execute_sqls(delete_sql)\n\n#print the current time\ndef print_time():\n current_time = datetime.datetime.now()\n formatted_time = current_time.strftime(\"%Y-%m-%d %H:%M:%S\")\n print(formatted_time)\n\n\ndef missing_index(threads,duration,ncolumns,nrows,colsize,table_name='table1'):\n 
#create a new table\n print_time()\n db=Database(init())\n delete_table(table_name)\n create_table(table_name,colsize, ncolumns)\n\n # insert some data to be selected \n insert_definitions = ', '.join(f'(SELECT substr(md5(random()::text), 1, {colsize}))' for i in range(ncolumns))\n insert_data=f'insert into {table_name} select generate_series(1,{nrows}),{insert_definitions}, now();'\n db.execute_sqls(insert_data) \n\n #select without the index\n missing_index='select * from '+table_name+' where id='\n db.concurrent_execute_sql(threads,duration,missing_index,nrows)\n\n #delete the table\n delete_table(table_name)\n #print the end time\n print_time()\n\nif __name__ == \"__main__\":\n # Number of threads to use for concurrent inserts\n num_threads = 10\n \n # Duration for which to run the inserts (in seconds)\n insert_duration = None\n \n # Number of columns in the table\n num_columns = 20\n \n # Number of rows to insert\n num_rows = 4000000\n \n # Size of each column (in characters)\n column_size = 50\n \n # Table name\n table_name = 'table1'\n \n # Call the insert_large_data function\n missing_index(num_threads, insert_duration, num_columns, num_rows, column_size, table_name)\n", + "description": "In the context of a database used by an online shopping platform, there is a situation where 10 users are performing searches on a database table with 20 columns and 4,000,000 rows. Each column has a size of 50 characters. However, there is a lack of necessary indexes in the database, leading to a potential exception or error.\n", + "desc": "In an Internet of Things (IoT) scenario, consider a database used for collecting and analyzing sensor data, named 'SensorDataDB'. This database is designed to handle a large volume of data from various types of sensors. It contains a primary table called 'SensorReadings', which stores information about sensor readings. The table has 4,000,000 rows of data, each representing a different sensor reading. It includes 20 columns, each with a size of 50 characters. These columns may contain information such as the sensor ID, reading type, reading value, timestamp, sensor location, and status information. However, due to the lack of necessary indexes on commonly used columns, such as sensor ID or reading type, the database becomes inefficient in handling large amounts of data. This lack of indexing leads to anomalies in the database, as it cannot efficiently process the numerous concurrent read and write requests. 
As a result, there may be delays or failures in processing the sensor data, impacting the overall performance and reliability of the IOT system.\n" + }, + "496": { + "start_time": "1697446036", + "end_time": "1697446109", + "start_timestamp": "2023-10-16 16:47:16", + "end_timestamp": "2023-10-16 16:48:29", + "alerts": [], + "labels": [ + "missing indexes" + ], + "command": "python anomaly_trigger/main.py --anomaly MISSING_INDEXES", + "script": "import psycopg2\nimport sys\nsys.path.append('/root/DB-GPT/')\nimport time\nimport datetime\nimport random\nimport yaml\nfrom multiprocessing.pool import *\n\n\nclass DBArgs(object):\n\n def __init__(self, dbtype, config, dbname=None):\n self.dbtype = dbtype\n if self.dbtype == 'mysql':\n self.host = config['host']\n self.port = config['port']\n self.user = config['user']\n self.password = config['password']\n self.dbname = dbname if dbname else config['dbname']\n self.driver = 'com.mysql.jdbc.Driver'\n self.jdbc = 'jdbc:mysql://'\n else:\n self.host = config['host']\n self.port = config['port']\n self.user = config['user']\n self.password = config['password']\n self.dbname = dbname if dbname else config['dbname']\n self.driver = 'org.postgresql.Driver'\n self.jdbc = 'jdbc:postgresql://'\n\nclass Database():\n def __init__(self, args, timeout=-1):\n self.args = args\n self.conn = self.resetConn(timeout)\n\n\n # self.schema = self.compute_table_schema()\n\n def resetConn(self, timeout=-1):\n conn = psycopg2.connect(database=self.args.dbname,\n user=self.args.user,\n password=self.args.password,\n host=self.args.host,\n port=self.args.port)\n return conn\n \n def execute_sqls(self,sql):\n self.conn =self.resetConn(timeout=-1)\n cur = self.conn.cursor()\n cur.execute(sql)\n self.conn.commit()\n cur.close()\n self.conn.close()\n\n def execute_sql_duration(self, duration, sql, max_id=0, commit_interval=500):\n self.conn = self.resetConn(timeout=-1)\n cursor = self.conn.cursor()\n start = time.time()\n cnt = 0\n if duration > 0:\n while (time.time() - start) < duration:\n if max_id > 0:\n id = random.randint(1, max_id - 1)\n cursor.execute(sql + str(id) + ';')\n else:\n cursor.execute(sql)\n cnt += 1\n if cnt % commit_interval == 0:\n self.conn.commit()\n else:\n print(\"error, the duration should be larger than 0\")\n self.conn.commit()\n cursor.close()\n self.conn.close()\n return cnt\n\n def concurrent_execute_sql(self, threads, duration, sql, max_id=0, commit_interval=500):\n pool = ThreadPool(threads)\n results = [pool.apply_async(self.execute_sql_duration, (duration, sql, max_id, commit_interval)) for _ in range(threads)]\n pool.close()\n pool.join()\n return results\n\ndef init():\n #add the config\n config_path = \"/root/DB-GPT/config/tool_config.yaml\"\n with open(config_path, 'r') as config_file:\n config = yaml.safe_load(config_file) \n db_args =DBArgs('pgsql', config)\n return db_args\n\n\n#create a table\ndef create_table(table_name,colsize, ncolumns):\n db=Database(init())\n column_definitions = ', '.join(f'name{i} varchar({colsize})' for i in range(ncolumns))\n creat_sql = f'CREATE TABLE {table_name} (id int, {column_definitions}, time timestamp);'\n db.execute_sqls(creat_sql)\n\n#delete the table\ndef delete_table(table_name):\n db=Database(init())\n delete_sql=f'DROP TABLE if exists {table_name}'\n db.execute_sqls(delete_sql)\n\n#print the current time\ndef print_time():\n current_time = datetime.datetime.now()\n formatted_time = current_time.strftime(\"%Y-%m-%d %H:%M:%S\")\n print(formatted_time)\n\n\ndef 
missing_index(threads,duration,ncolumns,nrows,colsize,table_name='table1'):\n #create a new table\n print_time()\n db=Database(init())\n delete_table(table_name)\n create_table(table_name,colsize, ncolumns)\n\n # insert some data to be selected \n insert_definitions = ', '.join(f'(SELECT substr(md5(random()::text), 1, {colsize}))' for i in range(ncolumns))\n insert_data=f'insert into {table_name} select generate_series(1,{nrows}),{insert_definitions}, now();'\n db.execute_sqls(insert_data) \n\n #select without the index\n missing_index='select * from '+table_name+' where id='\n db.concurrent_execute_sql(threads,duration,missing_index,nrows)\n\n #delete the table\n delete_table(table_name)\n #print the end time\n print_time()\n\nif __name__ == \"__main__\":\n # Number of threads to use for concurrent inserts\n num_threads = 10\n \n # Duration for which to run the inserts (in seconds)\n insert_duration = None\n \n # Number of columns in the table\n num_columns = 20\n \n # Number of rows to insert\n num_rows = 4000000\n \n # Size of each column (in characters)\n column_size = 50\n \n # Table name\n table_name = 'table1'\n \n # Call the insert_large_data function\n missing_index(num_threads, insert_duration, num_columns, num_rows, column_size, table_name)\n", + "description": "In an online shopping platform's database, when 10 users simultaneously search for information in a table with 20 columns and 4,000,000 rows, and each column is limited to a size of 50 characters, an exception will occur if there is no index created for the search operation.\n", + "desc": "In an Internet of Things (IoT) scenario, imagine a database named 'SensorDataDB', which is specifically designed to store and analyze sensor data. This database is used to handle a large volume of data from various types of sensors. The primary table in the database is called 'SensorReadings', which contains information from 4,000,000 sensor readings. Each reading consists of 20 columns, with each column able to store up to 50 characters of data. These columns might include sensor ID, reading type (such as temperature, humidity, pressure, light, motion), reading value, timestamp, location, and status information. In this scenario, multiple sensors are constantly transmitting data to the database at a high frequency. However, due to the lack of necessary indexes, particularly on commonly used search columns such as timestamp and sensor ID, the database becomes inefficient in handling a large number of concurrent data entry requests. This inefficiency eventually leads to anomalies in the database, as it cannot efficiently process these numerous concurrent data entry requests. 
This not only results in a decline in the real-time monitoring capabilities of the IoT system but may also affect the overall operational efficiency and reliability of the entire system.\n" + }, + "497": { + "start_time": "1697446221", + "end_time": "1697446293", + "start_timestamp": "2023-10-16 16:50:21", + "end_timestamp": "2023-10-16 16:51:33", + "alerts": [], + "labels": [ + "missing indexes" + ], + "command": "python anomaly_trigger/main.py --anomaly MISSING_INDEXES", + "script": "import psycopg2\nimport sys\nsys.path.append('/root/DB-GPT/')\nimport time\nimport datetime\nimport random\nimport yaml\nfrom multiprocessing.pool import *\n\n\nclass DBArgs(object):\n\n def __init__(self, dbtype, config, dbname=None):\n self.dbtype = dbtype\n if self.dbtype == 'mysql':\n self.host = config['host']\n self.port = config['port']\n self.user = config['user']\n self.password = config['password']\n self.dbname = dbname if dbname else config['dbname']\n self.driver = 'com.mysql.jdbc.Driver'\n self.jdbc = 'jdbc:mysql://'\n else:\n self.host = config['host']\n self.port = config['port']\n self.user = config['user']\n self.password = config['password']\n self.dbname = dbname if dbname else config['dbname']\n self.driver = 'org.postgresql.Driver'\n self.jdbc = 'jdbc:postgresql://'\n\nclass Database():\n def __init__(self, args, timeout=-1):\n self.args = args\n self.conn = self.resetConn(timeout)\n\n\n # self.schema = self.compute_table_schema()\n\n def resetConn(self, timeout=-1):\n conn = psycopg2.connect(database=self.args.dbname,\n user=self.args.user,\n password=self.args.password,\n host=self.args.host,\n port=self.args.port)\n return conn\n \n def execute_sqls(self,sql):\n self.conn =self.resetConn(timeout=-1)\n cur = self.conn.cursor()\n cur.execute(sql)\n self.conn.commit()\n cur.close()\n self.conn.close()\n\n def execute_sql_duration(self, duration, sql, max_id=0, commit_interval=500):\n self.conn = self.resetConn(timeout=-1)\n cursor = self.conn.cursor()\n start = time.time()\n cnt = 0\n if duration > 0:\n while (time.time() - start) < duration:\n if max_id > 0:\n id = random.randint(1, max_id - 1)\n cursor.execute(sql + str(id) + ';')\n else:\n cursor.execute(sql)\n cnt += 1\n if cnt % commit_interval == 0:\n self.conn.commit()\n else:\n print(\"error, the duration should be larger than 0\")\n self.conn.commit()\n cursor.close()\n self.conn.close()\n return cnt\n\n def concurrent_execute_sql(self, threads, duration, sql, max_id=0, commit_interval=500):\n pool = ThreadPool(threads)\n results = [pool.apply_async(self.execute_sql_duration, (duration, sql, max_id, commit_interval)) for _ in range(threads)]\n pool.close()\n pool.join()\n return results\n\ndef init():\n #add the config\n config_path = \"/root/DB-GPT/config/tool_config.yaml\"\n with open(config_path, 'r') as config_file:\n config = yaml.safe_load(config_file) \n db_args =DBArgs('pgsql', config)\n return db_args\n\n\n#create a table\ndef create_table(table_name,colsize, ncolumns):\n db=Database(init())\n column_definitions = ', '.join(f'name{i} varchar({colsize})' for i in range(ncolumns))\n creat_sql = f'CREATE TABLE {table_name} (id int, {column_definitions}, time timestamp);'\n db.execute_sqls(creat_sql)\n\n#delete the table\ndef delete_table(table_name):\n db=Database(init())\n delete_sql=f'DROP TABLE if exists {table_name}'\n db.execute_sqls(delete_sql)\n\n#print the current time\ndef print_time():\n current_time = datetime.datetime.now()\n formatted_time = current_time.strftime(\"%Y-%m-%d %H:%M:%S\")\n 
print(formatted_time)\n\n\ndef missing_index(threads,duration,ncolumns,nrows,colsize,table_name='table1'):\n #create a new table\n print_time()\n db=Database(init())\n delete_table(table_name)\n create_table(table_name,colsize, ncolumns)\n\n # insert some data to be selected \n insert_definitions = ', '.join(f'(SELECT substr(md5(random()::text), 1, {colsize}))' for i in range(ncolumns))\n insert_data=f'insert into {table_name} select generate_series(1,{nrows}),{insert_definitions}, now();'\n db.execute_sqls(insert_data) \n\n #select without the index\n missing_index='select * from '+table_name+' where id='\n db.concurrent_execute_sql(threads,duration,missing_index,nrows)\n\n #delete the table\n delete_table(table_name)\n #print the end time\n print_time()\n\nif __name__ == \"__main__\":\n # Number of threads to use for concurrent inserts\n num_threads = 5\n \n # Duration for which to run the inserts (in seconds)\n insert_duration = None\n \n # Number of columns in the table\n num_columns = 20\n \n # Number of rows to insert\n num_rows = 2000000\n \n # Size of each column (in characters)\n column_size = 100\n \n # Table name\n table_name = 'table1'\n \n # Call the insert_large_data function\n missing_index(num_threads, insert_duration, num_columns, num_rows, column_size, table_name)\n", + "description": "In a database of an online shopping platform, 5 users are simultaneously searching in a table with 20 columns and 2 million rows of data. Each column has a size of 100 characters. However, the search is lacking the necessary indexes, resulting in a database exception.\n", + "desc": "In an e-commerce platform, there is a database called 'ProductDatabase' used to store product information. In this database, there is a table named 'ProductDetails' which contains 2,000,000 rows of data, with each row representing a product entry. This table has 20 columns, each having a size of 100 characters. These columns store information such as product ID, name, price, stock quantity, brand, category, size, color, weight, user rating, number of reviews, production date, expiration date, supplier information, country, shipping method, discount information, image link, description, and product status. However, due to missing indexes on commonly used search columns like name, brand, and category, the database struggles to efficiently handle a high number of concurrent search requests. 
This can lead to delays or failures in search operations, negatively impacting the user experience and the overall efficiency of the e-commerce platform.\n" + }, + "498": { + "start_time": "1697446353", + "end_time": "1697446425", + "start_timestamp": "2023-10-16 16:52:33", + "end_timestamp": "2023-10-16 16:53:45", + "alerts": [], + "labels": [ + "missing indexes" + ], + "command": "python anomaly_trigger/main.py --anomaly MISSING_INDEXES", + "script": "import psycopg2\nimport sys\nsys.path.append('/root/DB-GPT/')\nimport time\nimport datetime\nimport random\nimport yaml\nfrom multiprocessing.pool import *\n\n\nclass DBArgs(object):\n\n def __init__(self, dbtype, config, dbname=None):\n self.dbtype = dbtype\n if self.dbtype == 'mysql':\n self.host = config['host']\n self.port = config['port']\n self.user = config['user']\n self.password = config['password']\n self.dbname = dbname if dbname else config['dbname']\n self.driver = 'com.mysql.jdbc.Driver'\n self.jdbc = 'jdbc:mysql://'\n else:\n self.host = config['host']\n self.port = config['port']\n self.user = config['user']\n self.password = config['password']\n self.dbname = dbname if dbname else config['dbname']\n self.driver = 'org.postgresql.Driver'\n self.jdbc = 'jdbc:postgresql://'\n\nclass Database():\n def __init__(self, args, timeout=-1):\n self.args = args\n self.conn = self.resetConn(timeout)\n\n\n # self.schema = self.compute_table_schema()\n\n def resetConn(self, timeout=-1):\n conn = psycopg2.connect(database=self.args.dbname,\n user=self.args.user,\n password=self.args.password,\n host=self.args.host,\n port=self.args.port)\n return conn\n \n def execute_sqls(self,sql):\n self.conn =self.resetConn(timeout=-1)\n cur = self.conn.cursor()\n cur.execute(sql)\n self.conn.commit()\n cur.close()\n self.conn.close()\n\n def execute_sql_duration(self, duration, sql, max_id=0, commit_interval=500):\n self.conn = self.resetConn(timeout=-1)\n cursor = self.conn.cursor()\n start = time.time()\n cnt = 0\n if duration > 0:\n while (time.time() - start) < duration:\n if max_id > 0:\n id = random.randint(1, max_id - 1)\n cursor.execute(sql + str(id) + ';')\n else:\n cursor.execute(sql)\n cnt += 1\n if cnt % commit_interval == 0:\n self.conn.commit()\n else:\n print(\"error, the duration should be larger than 0\")\n self.conn.commit()\n cursor.close()\n self.conn.close()\n return cnt\n\n def concurrent_execute_sql(self, threads, duration, sql, max_id=0, commit_interval=500):\n pool = ThreadPool(threads)\n results = [pool.apply_async(self.execute_sql_duration, (duration, sql, max_id, commit_interval)) for _ in range(threads)]\n pool.close()\n pool.join()\n return results\n\ndef init():\n #add the config\n config_path = \"/root/DB-GPT/config/tool_config.yaml\"\n with open(config_path, 'r') as config_file:\n config = yaml.safe_load(config_file) \n db_args =DBArgs('pgsql', config)\n return db_args\n\n\n#create a table\ndef create_table(table_name,colsize, ncolumns):\n db=Database(init())\n column_definitions = ', '.join(f'name{i} varchar({colsize})' for i in range(ncolumns))\n creat_sql = f'CREATE TABLE {table_name} (id int, {column_definitions}, time timestamp);'\n db.execute_sqls(creat_sql)\n\n#delete the table\ndef delete_table(table_name):\n db=Database(init())\n delete_sql=f'DROP TABLE if exists {table_name}'\n db.execute_sqls(delete_sql)\n\n#print the current time\ndef print_time():\n current_time = datetime.datetime.now()\n formatted_time = current_time.strftime(\"%Y-%m-%d %H:%M:%S\")\n print(formatted_time)\n\n\ndef 
missing_index(threads,duration,ncolumns,nrows,colsize,table_name='table1'):\n #create a new table\n print_time()\n db=Database(init())\n delete_table(table_name)\n create_table(table_name,colsize, ncolumns)\n\n # insert some data to be selected \n insert_definitions = ', '.join(f'(SELECT substr(md5(random()::text), 1, {colsize}))' for i in range(ncolumns))\n insert_data=f'insert into {table_name} select generate_series(1,{nrows}),{insert_definitions}, now();'\n db.execute_sqls(insert_data) \n\n #select without the index\n missing_index='select * from '+table_name+' where id='\n db.concurrent_execute_sql(threads,duration,missing_index,nrows)\n\n #delete the table\n delete_table(table_name)\n #print the end time\n print_time()\n\nif __name__ == \"__main__\":\n # Number of threads to use for concurrent inserts\n num_threads = 5\n \n # Duration for which to run the inserts (in seconds)\n insert_duration = None\n \n # Number of columns in the table\n num_columns = 20\n \n # Number of rows to insert\n num_rows = 2000000\n \n # Size of each column (in characters)\n column_size = 100\n \n # Table name\n table_name = 'table1'\n \n # Call the insert_large_data function\n missing_index(num_threads, insert_duration, num_columns, num_rows, column_size, table_name)\n", + "description": "In a database system used by an online shopping platform, 5 users perform searches on a database table with 20 columns and 2,000,000 rows. Each column can hold up to 100 characters. However, the search queries lack the necessary indexes, which causes a database exception to occur.\n", + "desc": "In an Internet of Things (IoT) scenario, suppose there is a database specifically designed for storing sensor data, named 'SensorDataDB'. This database is used to collect and analyze data from various types of sensors, such as temperature, humidity, pressure, light, and motion sensors. Within this database, there is a key table called 'SensorReadings' that stores the readings from these sensors. This table contains 2,000,000 rows of data, each representing a reading from a sensor, with a total of 20 columns. These columns include sensor ID, reading type, reading value, timestamp, sensor location, and status information. However, in this scenario, due to the lack of necessary indexes on commonly used search columns, such as sensor ID, reading type, and timestamp, the database becomes inefficient in handling a large number of concurrent search requests. This inefficiency leads to anomalies in the database, as it cannot efficiently process these numerous search requests. 
This can result in delays or failures in retrieving sensor data, which can impact the functionality and performance of the entire IoT system.\n" + }, + "499": { + "start_time": "1697446485", + "end_time": "1697446557", + "start_timestamp": "2023-10-16 16:54:45", + "end_timestamp": "2023-10-16 16:55:57", + "alerts": [], + "labels": [ + "missing indexes" + ], + "command": "python anomaly_trigger/main.py --anomaly MISSING_INDEXES", + "script": "import psycopg2\nimport sys\nsys.path.append('/root/DB-GPT/')\nimport time\nimport datetime\nimport random\nimport yaml\nfrom multiprocessing.pool import *\n\n\nclass DBArgs(object):\n\n def __init__(self, dbtype, config, dbname=None):\n self.dbtype = dbtype\n if self.dbtype == 'mysql':\n self.host = config['host']\n self.port = config['port']\n self.user = config['user']\n self.password = config['password']\n self.dbname = dbname if dbname else config['dbname']\n self.driver = 'com.mysql.jdbc.Driver'\n self.jdbc = 'jdbc:mysql://'\n else:\n self.host = config['host']\n self.port = config['port']\n self.user = config['user']\n self.password = config['password']\n self.dbname = dbname if dbname else config['dbname']\n self.driver = 'org.postgresql.Driver'\n self.jdbc = 'jdbc:postgresql://'\n\nclass Database():\n def __init__(self, args, timeout=-1):\n self.args = args\n self.conn = self.resetConn(timeout)\n\n\n # self.schema = self.compute_table_schema()\n\n def resetConn(self, timeout=-1):\n conn = psycopg2.connect(database=self.args.dbname,\n user=self.args.user,\n password=self.args.password,\n host=self.args.host,\n port=self.args.port)\n return conn\n \n def execute_sqls(self,sql):\n self.conn =self.resetConn(timeout=-1)\n cur = self.conn.cursor()\n cur.execute(sql)\n self.conn.commit()\n cur.close()\n self.conn.close()\n\n def execute_sql_duration(self, duration, sql, max_id=0, commit_interval=500):\n self.conn = self.resetConn(timeout=-1)\n cursor = self.conn.cursor()\n start = time.time()\n cnt = 0\n if duration > 0:\n while (time.time() - start) < duration:\n if max_id > 0:\n id = random.randint(1, max_id - 1)\n cursor.execute(sql + str(id) + ';')\n else:\n cursor.execute(sql)\n cnt += 1\n if cnt % commit_interval == 0:\n self.conn.commit()\n else:\n print(\"error, the duration should be larger than 0\")\n self.conn.commit()\n cursor.close()\n self.conn.close()\n return cnt\n\n def concurrent_execute_sql(self, threads, duration, sql, max_id=0, commit_interval=500):\n pool = ThreadPool(threads)\n results = [pool.apply_async(self.execute_sql_duration, (duration, sql, max_id, commit_interval)) for _ in range(threads)]\n pool.close()\n pool.join()\n return results\n\ndef init():\n #add the config\n config_path = \"/root/DB-GPT/config/tool_config.yaml\"\n with open(config_path, 'r') as config_file:\n config = yaml.safe_load(config_file) \n db_args =DBArgs('pgsql', config)\n return db_args\n\n\n#create a table\ndef create_table(table_name,colsize, ncolumns):\n db=Database(init())\n column_definitions = ', '.join(f'name{i} varchar({colsize})' for i in range(ncolumns))\n creat_sql = f'CREATE TABLE {table_name} (id int, {column_definitions}, time timestamp);'\n db.execute_sqls(creat_sql)\n\n#delete the table\ndef delete_table(table_name):\n db=Database(init())\n delete_sql=f'DROP TABLE if exists {table_name}'\n db.execute_sqls(delete_sql)\n\n#print the current time\ndef print_time():\n current_time = datetime.datetime.now()\n formatted_time = current_time.strftime(\"%Y-%m-%d %H:%M:%S\")\n print(formatted_time)\n\n\ndef 
missing_index(threads,duration,ncolumns,nrows,colsize,table_name='table1'):\n #create a new table\n print_time()\n db=Database(init())\n delete_table(table_name)\n create_table(table_name,colsize, ncolumns)\n\n # insert some data to be selected \n insert_definitions = ', '.join(f'(SELECT substr(md5(random()::text), 1, {colsize}))' for i in range(ncolumns))\n insert_data=f'insert into {table_name} select generate_series(1,{nrows}),{insert_definitions}, now();'\n db.execute_sqls(insert_data) \n\n #select without the index\n missing_index='select * from '+table_name+' where id='\n db.concurrent_execute_sql(threads,duration,missing_index,nrows)\n\n #delete the table\n delete_table(table_name)\n #print the end time\n print_time()\n\nif __name__ == \"__main__\":\n # Number of threads to use for concurrent inserts\n num_threads = 10\n \n # Duration for which to run the inserts (in seconds)\n insert_duration = None\n \n # Number of columns in the table\n num_columns = 20\n \n # Number of rows to insert\n num_rows = 2000000\n \n # Size of each column (in characters)\n column_size = 100\n \n # Table name\n table_name = 'table1'\n \n # Call the insert_large_data function\n missing_index(num_threads, insert_duration, num_columns, num_rows, column_size, table_name)\n", + "description": "In a database system used by an online shopping platform, there are 10 users simultaneously searching in a table with 20 columns and 2,000,000 rows. Each column has a size of 100 characters. However, an exception occurs because there is no index available for the search operation, causing performance issues in retrieving the required data.\n", + "desc": "In the database of an IoT platform, let's call it 'IoTPlatformDB', there is a database used for storing and analyzing sensor data. The main table in this database is called 'SensorReadings', which contains information from various sensors. This table consists of 2,000,000 rows of data, each representing a reading from a sensor. There are 20 columns in this table, including sensor ID, sensor type, location, reading value, timestamp, and other relevant information. However, due to the lack of necessary indexes, particularly on commonly used columns such as sensor ID and timestamp, the database becomes inefficient in processing a large volume of sensor data. This inefficiency can lead to anomalies in the database, such as delays in recording sensor readings or failures in retrieving specific readings. 
This can impact the overall functionality and performance of the IoT platform, affecting the real-time monitoring and analysis of sensor data.\n" + }, + "500": { + "start_time": "1697446617", + "end_time": "1697446690", + "start_timestamp": "2023-10-16 16:56:57", + "end_timestamp": "2023-10-16 16:58:10", + "alerts": [], + "labels": [ + "missing indexes" + ], + "command": "python anomaly_trigger/main.py --anomaly MISSING_INDEXES", + "script": "import psycopg2\nimport sys\nsys.path.append('/root/DB-GPT/')\nimport time\nimport datetime\nimport random\nimport yaml\nfrom multiprocessing.pool import *\n\n\nclass DBArgs(object):\n\n def __init__(self, dbtype, config, dbname=None):\n self.dbtype = dbtype\n if self.dbtype == 'mysql':\n self.host = config['host']\n self.port = config['port']\n self.user = config['user']\n self.password = config['password']\n self.dbname = dbname if dbname else config['dbname']\n self.driver = 'com.mysql.jdbc.Driver'\n self.jdbc = 'jdbc:mysql://'\n else:\n self.host = config['host']\n self.port = config['port']\n self.user = config['user']\n self.password = config['password']\n self.dbname = dbname if dbname else config['dbname']\n self.driver = 'org.postgresql.Driver'\n self.jdbc = 'jdbc:postgresql://'\n\nclass Database():\n def __init__(self, args, timeout=-1):\n self.args = args\n self.conn = self.resetConn(timeout)\n\n\n # self.schema = self.compute_table_schema()\n\n def resetConn(self, timeout=-1):\n conn = psycopg2.connect(database=self.args.dbname,\n user=self.args.user,\n password=self.args.password,\n host=self.args.host,\n port=self.args.port)\n return conn\n \n def execute_sqls(self,sql):\n self.conn =self.resetConn(timeout=-1)\n cur = self.conn.cursor()\n cur.execute(sql)\n self.conn.commit()\n cur.close()\n self.conn.close()\n\n def execute_sql_duration(self, duration, sql, max_id=0, commit_interval=500):\n self.conn = self.resetConn(timeout=-1)\n cursor = self.conn.cursor()\n start = time.time()\n cnt = 0\n if duration > 0:\n while (time.time() - start) < duration:\n if max_id > 0:\n id = random.randint(1, max_id - 1)\n cursor.execute(sql + str(id) + ';')\n else:\n cursor.execute(sql)\n cnt += 1\n if cnt % commit_interval == 0:\n self.conn.commit()\n else:\n print(\"error, the duration should be larger than 0\")\n self.conn.commit()\n cursor.close()\n self.conn.close()\n return cnt\n\n def concurrent_execute_sql(self, threads, duration, sql, max_id=0, commit_interval=500):\n pool = ThreadPool(threads)\n results = [pool.apply_async(self.execute_sql_duration, (duration, sql, max_id, commit_interval)) for _ in range(threads)]\n pool.close()\n pool.join()\n return results\n\ndef init():\n #add the config\n config_path = \"/root/DB-GPT/config/tool_config.yaml\"\n with open(config_path, 'r') as config_file:\n config = yaml.safe_load(config_file) \n db_args =DBArgs('pgsql', config)\n return db_args\n\n\n#create a table\ndef create_table(table_name,colsize, ncolumns):\n db=Database(init())\n column_definitions = ', '.join(f'name{i} varchar({colsize})' for i in range(ncolumns))\n creat_sql = f'CREATE TABLE {table_name} (id int, {column_definitions}, time timestamp);'\n db.execute_sqls(creat_sql)\n\n#delete the table\ndef delete_table(table_name):\n db=Database(init())\n delete_sql=f'DROP TABLE if exists {table_name}'\n db.execute_sqls(delete_sql)\n\n#print the current time\ndef print_time():\n current_time = datetime.datetime.now()\n formatted_time = current_time.strftime(\"%Y-%m-%d %H:%M:%S\")\n print(formatted_time)\n\n\ndef 
missing_index(threads,duration,ncolumns,nrows,colsize,table_name='table1'):\n #create a new table\n print_time()\n db=Database(init())\n delete_table(table_name)\n create_table(table_name,colsize, ncolumns)\n\n # insert some data to be selected \n insert_definitions = ', '.join(f'(SELECT substr(md5(random()::text), 1, {colsize}))' for i in range(ncolumns))\n insert_data=f'insert into {table_name} select generate_series(1,{nrows}),{insert_definitions}, now();'\n db.execute_sqls(insert_data) \n\n #select without the index\n missing_index='select * from '+table_name+' where id='\n db.concurrent_execute_sql(threads,duration,missing_index,nrows)\n\n #delete the table\n delete_table(table_name)\n #print the end time\n print_time()\n\nif __name__ == \"__main__\":\n # Number of threads to use for concurrent inserts\n num_threads = 10\n \n # Duration for which to run the inserts (in seconds)\n insert_duration = None\n \n # Number of columns in the table\n num_columns = 20\n \n # Number of rows to insert\n num_rows = 2000000\n \n # Size of each column (in characters)\n column_size = 100\n \n # Table name\n table_name = 'table1'\n \n # Call the insert_large_data function\n missing_index(num_threads, insert_duration, num_columns, num_rows, column_size, table_name)\n", + "description": "In a large database used by an e-commerce platform, 10 users are simultaneously searching for information in a table with 20 columns. The table contains 2 million rows, with each column having a size of 100 characters. However, there is a missing index on the database table, which leads to an exception being raised in the system.\n", + "desc": "In a business intelligence scenario, suppose there is a database named 'BusinessIntelligenceDB' that stores various business-related data. Within this database, there is a table named 'DataRecords' that contains 2,000,000 rows of data, each representing a specific business record. This table consists of 20 columns, each with a size of 100 characters. These columns might include information such as transaction ID, customer ID, product ID, sales amount, date, time, location, and other details relevant to business operations. However, due to the absence of necessary indexes, particularly on commonly used columns for data retrieval and analysis purposes, such as customer ID, product ID, and transaction date, the database encounters inefficiency in handling a large number of concurrent queries and join operations. This inefficiency eventually leads to anomalies in the database, as it cannot efficiently process these numerous concurrent operations. 
This not only results in a decline in the performance of business intelligence queries and analysis but may also affect the decision-making process and operational efficiency of the entire organization.\n" + }, + "501": { + "start_time": "1697446896", + "end_time": "1697446967", + "start_timestamp": "2023-10-16 17:01:36", + "end_timestamp": "2023-10-16 17:02:47", + "alerts": [], + "labels": [ + "missing indexes" + ], + "command": "python anomaly_trigger/main.py --anomaly MISSING_INDEXES", + "script": "import psycopg2\nimport sys\nsys.path.append('/root/DB-GPT/')\nimport time\nimport datetime\nimport random\nimport yaml\nfrom multiprocessing.pool import *\n\n\nclass DBArgs(object):\n\n def __init__(self, dbtype, config, dbname=None):\n self.dbtype = dbtype\n if self.dbtype == 'mysql':\n self.host = config['host']\n self.port = config['port']\n self.user = config['user']\n self.password = config['password']\n self.dbname = dbname if dbname else config['dbname']\n self.driver = 'com.mysql.jdbc.Driver'\n self.jdbc = 'jdbc:mysql://'\n else:\n self.host = config['host']\n self.port = config['port']\n self.user = config['user']\n self.password = config['password']\n self.dbname = dbname if dbname else config['dbname']\n self.driver = 'org.postgresql.Driver'\n self.jdbc = 'jdbc:postgresql://'\n\nclass Database():\n def __init__(self, args, timeout=-1):\n self.args = args\n self.conn = self.resetConn(timeout)\n\n\n # self.schema = self.compute_table_schema()\n\n def resetConn(self, timeout=-1):\n conn = psycopg2.connect(database=self.args.dbname,\n user=self.args.user,\n password=self.args.password,\n host=self.args.host,\n port=self.args.port)\n return conn\n \n def execute_sqls(self,sql):\n self.conn =self.resetConn(timeout=-1)\n cur = self.conn.cursor()\n cur.execute(sql)\n self.conn.commit()\n cur.close()\n self.conn.close()\n\n def execute_sql_duration(self, duration, sql, max_id=0, commit_interval=500):\n self.conn = self.resetConn(timeout=-1)\n cursor = self.conn.cursor()\n start = time.time()\n cnt = 0\n if duration > 0:\n while (time.time() - start) < duration:\n if max_id > 0:\n id = random.randint(1, max_id - 1)\n cursor.execute(sql + str(id) + ';')\n else:\n cursor.execute(sql)\n cnt += 1\n if cnt % commit_interval == 0:\n self.conn.commit()\n else:\n print(\"error, the duration should be larger than 0\")\n self.conn.commit()\n cursor.close()\n self.conn.close()\n return cnt\n\n def concurrent_execute_sql(self, threads, duration, sql, max_id=0, commit_interval=500):\n pool = ThreadPool(threads)\n results = [pool.apply_async(self.execute_sql_duration, (duration, sql, max_id, commit_interval)) for _ in range(threads)]\n pool.close()\n pool.join()\n return results\n\ndef init():\n #add the config\n config_path = \"/root/DB-GPT/config/tool_config.yaml\"\n with open(config_path, 'r') as config_file:\n config = yaml.safe_load(config_file) \n db_args =DBArgs('pgsql', config)\n return db_args\n\n\n#create a table\ndef create_table(table_name,colsize, ncolumns):\n db=Database(init())\n column_definitions = ', '.join(f'name{i} varchar({colsize})' for i in range(ncolumns))\n creat_sql = f'CREATE TABLE {table_name} (id int, {column_definitions}, time timestamp);'\n db.execute_sqls(creat_sql)\n\n#delete the table\ndef delete_table(table_name):\n db=Database(init())\n delete_sql=f'DROP TABLE if exists {table_name}'\n db.execute_sqls(delete_sql)\n\n#print the current time\ndef print_time():\n current_time = datetime.datetime.now()\n formatted_time = current_time.strftime(\"%Y-%m-%d %H:%M:%S\")\n 
print(formatted_time)\n\n\ndef missing_index(threads,duration,ncolumns,nrows,colsize,table_name='table1'):\n #create a new table\n print_time()\n db=Database(init())\n delete_table(table_name)\n create_table(table_name,colsize, ncolumns)\n\n # insert some data to be selected \n insert_definitions = ', '.join(f'(SELECT substr(md5(random()::text), 1, {colsize}))' for i in range(ncolumns))\n insert_data=f'insert into {table_name} select generate_series(1,{nrows}),{insert_definitions}, now();'\n db.execute_sqls(insert_data) \n\n #select without the index\n missing_index='select * from '+table_name+' where id='\n db.concurrent_execute_sql(threads,duration,missing_index,nrows)\n\n #delete the table\n delete_table(table_name)\n #print the end time\n print_time()\n\nif __name__ == \"__main__\":\n # Number of threads to use for concurrent inserts\n num_threads = 5\n \n # Duration for which to run the inserts (in seconds)\n insert_duration = None\n \n # Number of columns in the table\n num_columns = 20\n \n # Number of rows to insert\n num_rows = 4000000\n \n # Size of each column (in characters)\n column_size = 100\n \n # Table name\n table_name = 'table1'\n \n # Call the insert_large_data function\n missing_index(num_threads, insert_duration, num_columns, num_rows, column_size, table_name)\n", + "description": "This statement is simulating a scenario in an online shopping platform where 5 users are searching the database table for information. The table contains 20 columns with data size of 100 characters and 4,000,000 rows. However, there is a missing index, which can cause a database exception or error when trying to perform the search operation.\n", + "desc": "In an e-commerce platform, there is a database called 'ECommerceDB' that stores information about various products. One of the key tables in this database is 'ProductDetails', which contains 4,000,000 rows of data, each representing a product. This table has 20 columns, including product ID, name, price, stock quantity, brand, category, size, color, weight, user rating, number of reviews, production date, expiration date, supplier information, country, shipping method, discount information, image link, description, and product status. Each column can store up to 100 characters of data.However, there are missing indexes in the 'ProductDetails' table, particularly on commonly used search columns such as name, brand, and category. This lack of indexes makes it inefficient for the database to handle a high volume of concurrent search requests. As a result, when 5 users simultaneously search for products through the platform's search interface, the database may experience delays or failures in processing these requests. 
This inefficiency not only affects the user experience but also has implications for the operational efficiency and reputation of the entire e-commerce platform.\n" + }, + "502": { + "start_time": "1697447027", + "end_time": "1697447101", + "start_timestamp": "2023-10-16 17:03:47", + "end_timestamp": "2023-10-16 17:05:01", + "alerts": [], + "labels": [ + "missing indexes" + ], + "command": "python anomaly_trigger/main.py --anomaly MISSING_INDEXES", + "script": "import psycopg2\nimport sys\nsys.path.append('/root/DB-GPT/')\nimport time\nimport datetime\nimport random\nimport yaml\nfrom multiprocessing.pool import *\n\n\nclass DBArgs(object):\n\n def __init__(self, dbtype, config, dbname=None):\n self.dbtype = dbtype\n if self.dbtype == 'mysql':\n self.host = config['host']\n self.port = config['port']\n self.user = config['user']\n self.password = config['password']\n self.dbname = dbname if dbname else config['dbname']\n self.driver = 'com.mysql.jdbc.Driver'\n self.jdbc = 'jdbc:mysql://'\n else:\n self.host = config['host']\n self.port = config['port']\n self.user = config['user']\n self.password = config['password']\n self.dbname = dbname if dbname else config['dbname']\n self.driver = 'org.postgresql.Driver'\n self.jdbc = 'jdbc:postgresql://'\n\nclass Database():\n def __init__(self, args, timeout=-1):\n self.args = args\n self.conn = self.resetConn(timeout)\n\n\n # self.schema = self.compute_table_schema()\n\n def resetConn(self, timeout=-1):\n conn = psycopg2.connect(database=self.args.dbname,\n user=self.args.user,\n password=self.args.password,\n host=self.args.host,\n port=self.args.port)\n return conn\n \n def execute_sqls(self,sql):\n self.conn =self.resetConn(timeout=-1)\n cur = self.conn.cursor()\n cur.execute(sql)\n self.conn.commit()\n cur.close()\n self.conn.close()\n\n def execute_sql_duration(self, duration, sql, max_id=0, commit_interval=500):\n self.conn = self.resetConn(timeout=-1)\n cursor = self.conn.cursor()\n start = time.time()\n cnt = 0\n if duration > 0:\n while (time.time() - start) < duration:\n if max_id > 0:\n id = random.randint(1, max_id - 1)\n cursor.execute(sql + str(id) + ';')\n else:\n cursor.execute(sql)\n cnt += 1\n if cnt % commit_interval == 0:\n self.conn.commit()\n else:\n print(\"error, the duration should be larger than 0\")\n self.conn.commit()\n cursor.close()\n self.conn.close()\n return cnt\n\n def concurrent_execute_sql(self, threads, duration, sql, max_id=0, commit_interval=500):\n pool = ThreadPool(threads)\n results = [pool.apply_async(self.execute_sql_duration, (duration, sql, max_id, commit_interval)) for _ in range(threads)]\n pool.close()\n pool.join()\n return results\n\ndef init():\n #add the config\n config_path = \"/root/DB-GPT/config/tool_config.yaml\"\n with open(config_path, 'r') as config_file:\n config = yaml.safe_load(config_file) \n db_args =DBArgs('pgsql', config)\n return db_args\n\n\n#create a table\ndef create_table(table_name,colsize, ncolumns):\n db=Database(init())\n column_definitions = ', '.join(f'name{i} varchar({colsize})' for i in range(ncolumns))\n creat_sql = f'CREATE TABLE {table_name} (id int, {column_definitions}, time timestamp);'\n db.execute_sqls(creat_sql)\n\n#delete the table\ndef delete_table(table_name):\n db=Database(init())\n delete_sql=f'DROP TABLE if exists {table_name}'\n db.execute_sqls(delete_sql)\n\n#print the current time\ndef print_time():\n current_time = datetime.datetime.now()\n formatted_time = current_time.strftime(\"%Y-%m-%d %H:%M:%S\")\n print(formatted_time)\n\n\ndef 
missing_index(threads,duration,ncolumns,nrows,colsize,table_name='table1'):\n #create a new table\n print_time()\n db=Database(init())\n delete_table(table_name)\n create_table(table_name,colsize, ncolumns)\n\n # insert some data to be selected \n insert_definitions = ', '.join(f'(SELECT substr(md5(random()::text), 1, {colsize}))' for i in range(ncolumns))\n insert_data=f'insert into {table_name} select generate_series(1,{nrows}),{insert_definitions}, now();'\n db.execute_sqls(insert_data) \n\n #select without the index\n missing_index='select * from '+table_name+' where id='\n db.concurrent_execute_sql(threads,duration,missing_index,nrows)\n\n #delete the table\n delete_table(table_name)\n #print the end time\n print_time()\n\nif __name__ == \"__main__\":\n # Number of threads to use for concurrent inserts\n num_threads = 10\n \n # Duration for which to run the inserts (in seconds)\n insert_duration = None\n \n # Number of columns in the table\n num_columns = 20\n \n # Number of rows to insert\n num_rows = 4000000\n \n # Size of each column (in characters)\n column_size = 100\n \n # Table name\n table_name = 'table1'\n \n # Call the insert_large_data function\n missing_index(num_threads, insert_duration, num_columns, num_rows, column_size, table_name)\n", + "description": "In a database system for an online shopping platform, 10 users are performing searches in a database table with 20 columns, 4,000,000 rows, and each column has a size of 100 characters. However, there is no index configured for the search operation, resulting in a potential database exception.\n", + "desc": "In a life scenario resembling a banking database, there is a database called 'BankingDB' that is used to store and manage customer and transaction data for a bank. Within this database, there is a key table named 'AccountTransactions' that records detailed information about various banking transactions. This table contains 4,000,000 rows of data, with each row representing a transaction record for a specific account. There are a total of 20 columns in this table, each column having a size of 100 characters. These columns include transaction ID, account number, transaction type (such as deposit, withdrawal, or transfer), transaction amount, transaction date and time, counterparty account information, transaction status, employee ID, transaction location, currency type, and more. When multiple users try to access this table with frequent search queries or transaction updates, the database performance might degrade due to the lack of necessary indexes on commonly searched columns. Without the appropriate indexes on columns such as transaction type, account number, or transaction status, the database may struggle to efficiently process the concurrent search requests. This inefficiency can result in a decline in user experience, such as slow query response times or transaction processing delays. 
It could also potentially impact the overall efficiency and reputation of the banking institution's online banking platform.\n" + }, + "503": { + "start_time": "1697447162", + "end_time": "1697447235", + "start_timestamp": "2023-10-16 17:06:02", + "end_timestamp": "2023-10-16 17:07:15", + "alerts": [], + "labels": [ + "missing indexes" + ], + "command": "python anomaly_trigger/main.py --anomaly MISSING_INDEXES", + "script": "import psycopg2\nimport sys\nsys.path.append('/root/DB-GPT/')\nimport time\nimport datetime\nimport random\nimport yaml\nfrom multiprocessing.pool import *\n\n\nclass DBArgs(object):\n\n def __init__(self, dbtype, config, dbname=None):\n self.dbtype = dbtype\n if self.dbtype == 'mysql':\n self.host = config['host']\n self.port = config['port']\n self.user = config['user']\n self.password = config['password']\n self.dbname = dbname if dbname else config['dbname']\n self.driver = 'com.mysql.jdbc.Driver'\n self.jdbc = 'jdbc:mysql://'\n else:\n self.host = config['host']\n self.port = config['port']\n self.user = config['user']\n self.password = config['password']\n self.dbname = dbname if dbname else config['dbname']\n self.driver = 'org.postgresql.Driver'\n self.jdbc = 'jdbc:postgresql://'\n\nclass Database():\n def __init__(self, args, timeout=-1):\n self.args = args\n self.conn = self.resetConn(timeout)\n\n\n # self.schema = self.compute_table_schema()\n\n def resetConn(self, timeout=-1):\n conn = psycopg2.connect(database=self.args.dbname,\n user=self.args.user,\n password=self.args.password,\n host=self.args.host,\n port=self.args.port)\n return conn\n \n def execute_sqls(self,sql):\n self.conn =self.resetConn(timeout=-1)\n cur = self.conn.cursor()\n cur.execute(sql)\n self.conn.commit()\n cur.close()\n self.conn.close()\n\n def execute_sql_duration(self, duration, sql, max_id=0, commit_interval=500):\n self.conn = self.resetConn(timeout=-1)\n cursor = self.conn.cursor()\n start = time.time()\n cnt = 0\n if duration > 0:\n while (time.time() - start) < duration:\n if max_id > 0:\n id = random.randint(1, max_id - 1)\n cursor.execute(sql + str(id) + ';')\n else:\n cursor.execute(sql)\n cnt += 1\n if cnt % commit_interval == 0:\n self.conn.commit()\n else:\n print(\"error, the duration should be larger than 0\")\n self.conn.commit()\n cursor.close()\n self.conn.close()\n return cnt\n\n def concurrent_execute_sql(self, threads, duration, sql, max_id=0, commit_interval=500):\n pool = ThreadPool(threads)\n results = [pool.apply_async(self.execute_sql_duration, (duration, sql, max_id, commit_interval)) for _ in range(threads)]\n pool.close()\n pool.join()\n return results\n\ndef init():\n #add the config\n config_path = \"/root/DB-GPT/config/tool_config.yaml\"\n with open(config_path, 'r') as config_file:\n config = yaml.safe_load(config_file) \n db_args =DBArgs('pgsql', config)\n return db_args\n\n\n#create a table\ndef create_table(table_name,colsize, ncolumns):\n db=Database(init())\n column_definitions = ', '.join(f'name{i} varchar({colsize})' for i in range(ncolumns))\n creat_sql = f'CREATE TABLE {table_name} (id int, {column_definitions}, time timestamp);'\n db.execute_sqls(creat_sql)\n\n#delete the table\ndef delete_table(table_name):\n db=Database(init())\n delete_sql=f'DROP TABLE if exists {table_name}'\n db.execute_sqls(delete_sql)\n\n#print the current time\ndef print_time():\n current_time = datetime.datetime.now()\n formatted_time = current_time.strftime(\"%Y-%m-%d %H:%M:%S\")\n print(formatted_time)\n\n\ndef 
missing_index(threads,duration,ncolumns,nrows,colsize,table_name='table1'):\n #create a new table\n print_time()\n db=Database(init())\n delete_table(table_name)\n create_table(table_name,colsize, ncolumns)\n\n # insert some data to be selected \n insert_definitions = ', '.join(f'(SELECT substr(md5(random()::text), 1, {colsize}))' for i in range(ncolumns))\n insert_data=f'insert into {table_name} select generate_series(1,{nrows}),{insert_definitions}, now();'\n db.execute_sqls(insert_data) \n\n #select without the index\n missing_index='select * from '+table_name+' where id='\n db.concurrent_execute_sql(threads,duration,missing_index,nrows)\n\n #delete the table\n delete_table(table_name)\n #print the end time\n print_time()\n\nif __name__ == \"__main__\":\n # Number of threads to use for concurrent inserts\n num_threads = 10\n \n # Duration for which to run the inserts (in seconds)\n insert_duration = None\n \n # Number of columns in the table\n num_columns = 20\n \n # Number of rows to insert\n num_rows = 4000000\n \n # Size of each column (in characters)\n column_size = 100\n \n # Table name\n table_name = 'table1'\n \n # Call the insert_large_data function\n missing_index(num_threads, insert_duration, num_columns, num_rows, column_size, table_name)\n", + "description": "In an online shopping platform with a database table containing 20 columns and 4,000,000 rows, if there are 10 users searching for products using terms like product name, category, and price range, but the necessary indexes are missing, the search operation may result in a database exception.\n", + "desc": "In a business intelligence scenario, there is a database named 'BIAnalyticsDB' used for analyzing and reporting data from various sources. This database contains a key table named 'SalesRecords', which stores detailed information about sales transactions. The table consists of 4,000,000 rows of data, each representing a sales record, with 20 columns, each capable of storing up to 100 characters. These columns may include transaction ID, customer ID, product ID, transaction date, transaction amount, payment method, sales channel, and more.In this scenario, analysts frequently perform queries to analyze sales performance, such as calculating total sales by month, product category, or customer segment. However, due to the lack of necessary indexes on commonly used columns, such as transaction date, product ID, and customer ID, the database is inefficient in processing these queries. 
As a result, query performance is degraded, causing delays in generating reports and impacting the overall efficiency of the business intelligence process.By running the provided script with the 'MISSING_INDEXES' anomaly trigger, the system will simulate a scenario in which 10 users simultaneously execute queries on the 'SalesRecords' table, causing increased load and highlighting the need for indexing optimization.\n" + }, + "504": { + "start_time": "1697447302", + "end_time": "1697447373", + "start_timestamp": "2023-10-16 17:08:22", + "end_timestamp": "2023-10-16 17:09:33", + "alerts": [], + "labels": [ + "missing indexes" + ], + "command": "python anomaly_trigger/main.py --anomaly MISSING_INDEXES", + "script": "import psycopg2\nimport sys\nsys.path.append('/root/DB-GPT/')\nimport time\nimport datetime\nimport random\nimport yaml\nfrom multiprocessing.pool import *\n\n\nclass DBArgs(object):\n\n def __init__(self, dbtype, config, dbname=None):\n self.dbtype = dbtype\n if self.dbtype == 'mysql':\n self.host = config['host']\n self.port = config['port']\n self.user = config['user']\n self.password = config['password']\n self.dbname = dbname if dbname else config['dbname']\n self.driver = 'com.mysql.jdbc.Driver'\n self.jdbc = 'jdbc:mysql://'\n else:\n self.host = config['host']\n self.port = config['port']\n self.user = config['user']\n self.password = config['password']\n self.dbname = dbname if dbname else config['dbname']\n self.driver = 'org.postgresql.Driver'\n self.jdbc = 'jdbc:postgresql://'\n\nclass Database():\n def __init__(self, args, timeout=-1):\n self.args = args\n self.conn = self.resetConn(timeout)\n\n\n # self.schema = self.compute_table_schema()\n\n def resetConn(self, timeout=-1):\n conn = psycopg2.connect(database=self.args.dbname,\n user=self.args.user,\n password=self.args.password,\n host=self.args.host,\n port=self.args.port)\n return conn\n \n def execute_sqls(self,sql):\n self.conn =self.resetConn(timeout=-1)\n cur = self.conn.cursor()\n cur.execute(sql)\n self.conn.commit()\n cur.close()\n self.conn.close()\n\n def execute_sql_duration(self, duration, sql, max_id=0, commit_interval=500):\n self.conn = self.resetConn(timeout=-1)\n cursor = self.conn.cursor()\n start = time.time()\n cnt = 0\n if duration > 0:\n while (time.time() - start) < duration:\n if max_id > 0:\n id = random.randint(1, max_id - 1)\n cursor.execute(sql + str(id) + ';')\n else:\n cursor.execute(sql)\n cnt += 1\n if cnt % commit_interval == 0:\n self.conn.commit()\n else:\n print(\"error, the duration should be larger than 0\")\n self.conn.commit()\n cursor.close()\n self.conn.close()\n return cnt\n\n def concurrent_execute_sql(self, threads, duration, sql, max_id=0, commit_interval=500):\n pool = ThreadPool(threads)\n results = [pool.apply_async(self.execute_sql_duration, (duration, sql, max_id, commit_interval)) for _ in range(threads)]\n pool.close()\n pool.join()\n return results\n\ndef init():\n #add the config\n config_path = \"/root/DB-GPT/config/tool_config.yaml\"\n with open(config_path, 'r') as config_file:\n config = yaml.safe_load(config_file) \n db_args =DBArgs('pgsql', config)\n return db_args\n\n\n#create a table\ndef create_table(table_name,colsize, ncolumns):\n db=Database(init())\n column_definitions = ', '.join(f'name{i} varchar({colsize})' for i in range(ncolumns))\n creat_sql = f'CREATE TABLE {table_name} (id int, {column_definitions}, time timestamp);'\n db.execute_sqls(creat_sql)\n\n#delete the table\ndef delete_table(table_name):\n db=Database(init())\n delete_sql=f'DROP 
TABLE if exists {table_name}'\n db.execute_sqls(delete_sql)\n\n#print the current time\ndef print_time():\n current_time = datetime.datetime.now()\n formatted_time = current_time.strftime(\"%Y-%m-%d %H:%M:%S\")\n print(formatted_time)\n\n\ndef missing_index(threads,duration,ncolumns,nrows,colsize,table_name='table1'):\n #create a new table\n print_time()\n db=Database(init())\n delete_table(table_name)\n create_table(table_name,colsize, ncolumns)\n\n # insert some data to be selected \n insert_definitions = ', '.join(f'(SELECT substr(md5(random()::text), 1, {colsize}))' for i in range(ncolumns))\n insert_data=f'insert into {table_name} select generate_series(1,{nrows}),{insert_definitions}, now();'\n db.execute_sqls(insert_data) \n\n #select without the index\n missing_index='select * from '+table_name+' where id='\n db.concurrent_execute_sql(threads,duration,missing_index,nrows)\n\n #delete the table\n delete_table(table_name)\n #print the end time\n print_time()\n\nif __name__ == \"__main__\":\n # Number of threads to use for concurrent inserts\n num_threads = 5\n \n # Duration for which to run the inserts (in seconds)\n insert_duration = None\n \n # Number of columns in the table\n num_columns = 10\n \n # Number of rows to insert\n num_rows = 2000000\n \n # Size of each column (in characters)\n column_size = 50\n \n # Table name\n table_name = 'table1'\n \n # Call the insert_large_data function\n missing_index(num_threads, insert_duration, num_columns, num_rows, column_size, table_name)\n", + "description": "In the database of an online platform, if there are 5 users searching in the database table containing 10 columns, 2,000,000 rows, each column size of 50 characters, but the search lacks the necessary index, resulting in an exception in the database.\n", + "desc": "In the life scenario of an e-commerce platform, let's consider a database named 'ECommerceDB' that stores information about various types of products. Within this database, there is a table called 'ProductDetails' that contains 2,000,000 rows of data, each representing a product entry. The table consists of 10 columns, including product ID, name, price, stock quantity, brand, category, size, color, weight, and user rating, with each column having a size of 50 characters. However, there is a missing index on this table, particularly on commonly used search columns such as name, brand, and category. Due to the absence of these necessary indexes, the database becomes inefficient in handling concurrent search requests. When 5 users simultaneously search for products through the platform's search interface, for example, by entering a product name or brand, the database experiences performance issues. This inefficiency leads to anomalies in the database, which can result in a decline in user experience, such as search delays or failures. 
It can also impact the operational efficiency and reputation of the entire e-commerce platform.\n" + }, + "505": { + "start_time": "1697447433", + "end_time": "1697447504", + "start_timestamp": "2023-10-16 17:10:33", + "end_timestamp": "2023-10-16 17:11:44", + "alerts": [], + "labels": [ + "missing indexes" + ], + "command": "python anomaly_trigger/main.py --anomaly MISSING_INDEXES", + "script": "import psycopg2\nimport sys\nsys.path.append('/root/DB-GPT/')\nimport time\nimport datetime\nimport random\nimport yaml\nfrom multiprocessing.pool import *\n\n\nclass DBArgs(object):\n\n def __init__(self, dbtype, config, dbname=None):\n self.dbtype = dbtype\n if self.dbtype == 'mysql':\n self.host = config['host']\n self.port = config['port']\n self.user = config['user']\n self.password = config['password']\n self.dbname = dbname if dbname else config['dbname']\n self.driver = 'com.mysql.jdbc.Driver'\n self.jdbc = 'jdbc:mysql://'\n else:\n self.host = config['host']\n self.port = config['port']\n self.user = config['user']\n self.password = config['password']\n self.dbname = dbname if dbname else config['dbname']\n self.driver = 'org.postgresql.Driver'\n self.jdbc = 'jdbc:postgresql://'\n\nclass Database():\n def __init__(self, args, timeout=-1):\n self.args = args\n self.conn = self.resetConn(timeout)\n\n\n # self.schema = self.compute_table_schema()\n\n def resetConn(self, timeout=-1):\n conn = psycopg2.connect(database=self.args.dbname,\n user=self.args.user,\n password=self.args.password,\n host=self.args.host,\n port=self.args.port)\n return conn\n \n def execute_sqls(self,sql):\n self.conn =self.resetConn(timeout=-1)\n cur = self.conn.cursor()\n cur.execute(sql)\n self.conn.commit()\n cur.close()\n self.conn.close()\n\n def execute_sql_duration(self, duration, sql, max_id=0, commit_interval=500):\n self.conn = self.resetConn(timeout=-1)\n cursor = self.conn.cursor()\n start = time.time()\n cnt = 0\n if duration > 0:\n while (time.time() - start) < duration:\n if max_id > 0:\n id = random.randint(1, max_id - 1)\n cursor.execute(sql + str(id) + ';')\n else:\n cursor.execute(sql)\n cnt += 1\n if cnt % commit_interval == 0:\n self.conn.commit()\n else:\n print(\"error, the duration should be larger than 0\")\n self.conn.commit()\n cursor.close()\n self.conn.close()\n return cnt\n\n def concurrent_execute_sql(self, threads, duration, sql, max_id=0, commit_interval=500):\n pool = ThreadPool(threads)\n results = [pool.apply_async(self.execute_sql_duration, (duration, sql, max_id, commit_interval)) for _ in range(threads)]\n pool.close()\n pool.join()\n return results\n\ndef init():\n #add the config\n config_path = \"/root/DB-GPT/config/tool_config.yaml\"\n with open(config_path, 'r') as config_file:\n config = yaml.safe_load(config_file) \n db_args =DBArgs('pgsql', config)\n return db_args\n\n\n#create a table\ndef create_table(table_name,colsize, ncolumns):\n db=Database(init())\n column_definitions = ', '.join(f'name{i} varchar({colsize})' for i in range(ncolumns))\n creat_sql = f'CREATE TABLE {table_name} (id int, {column_definitions}, time timestamp);'\n db.execute_sqls(creat_sql)\n\n#delete the table\ndef delete_table(table_name):\n db=Database(init())\n delete_sql=f'DROP TABLE if exists {table_name}'\n db.execute_sqls(delete_sql)\n\n#print the current time\ndef print_time():\n current_time = datetime.datetime.now()\n formatted_time = current_time.strftime(\"%Y-%m-%d %H:%M:%S\")\n print(formatted_time)\n\n\ndef missing_index(threads,duration,ncolumns,nrows,colsize,table_name='table1'):\n 
#create a new table\n print_time()\n db=Database(init())\n delete_table(table_name)\n create_table(table_name,colsize, ncolumns)\n\n # insert some data to be selected \n insert_definitions = ', '.join(f'(SELECT substr(md5(random()::text), 1, {colsize}))' for i in range(ncolumns))\n insert_data=f'insert into {table_name} select generate_series(1,{nrows}),{insert_definitions}, now();'\n db.execute_sqls(insert_data) \n\n #select without the index\n missing_index='select * from '+table_name+' where id='\n db.concurrent_execute_sql(threads,duration,missing_index,nrows)\n\n #delete the table\n delete_table(table_name)\n #print the end time\n print_time()\n\nif __name__ == \"__main__\":\n # Number of threads to use for concurrent inserts\n num_threads = 5\n \n # Duration for which to run the inserts (in seconds)\n insert_duration = None\n \n # Number of columns in the table\n num_columns = 5\n \n # Number of rows to insert\n num_rows = 2000000\n \n # Size of each column (in characters)\n column_size = 50\n \n # Table name\n table_name = 'table1'\n \n # Call the insert_large_data function\n missing_index(num_threads, insert_duration, num_columns, num_rows, column_size, table_name)\n", + "description": "In a database used by an online platform, there is an issue with missing indexes. This causes exceptions when 5 users perform searches simultaneously in a table with 5 columns and 2,000,000 rows. Each column has a size of 50 characters. The lack of necessary indexes hampers the efficiency of the search process.\n", + "desc": "In a banking scenario with a database named 'BankingDB', the system handles customer and transaction data for a bank. Within this database, there is a table called 'TransactionDetails' that records detailed information about various banking transactions. This table consists of 2,000,000 rows of data, each representing a transaction, and it contains 5 columns with a maximum size of 50 characters. These columns may include transaction ID, account number, transaction type, transaction amount, and transaction date and time. However, due to the lack of necessary indexes on commonly used columns like account number and transaction type, the database may experience inefficiencies when handling concurrent transaction requests. As a result, the processing time for these requests may be delayed, leading to potential anomalies in the database. 
These anomalies can affect both the customer experience, with delays or failures in transaction processing, as well as the overall operational efficiency and reputation of the bank.\n" + }, + "506": { + "start_time": "1697447564", + "end_time": "1697447635", + "start_timestamp": "2023-10-16 17:12:44", + "end_timestamp": "2023-10-16 17:13:55", + "alerts": [], + "labels": [ + "missing indexes" + ], + "command": "python anomaly_trigger/main.py --anomaly MISSING_INDEXES", + "script": "import psycopg2\nimport sys\nsys.path.append('/root/DB-GPT/')\nimport time\nimport datetime\nimport random\nimport yaml\nfrom multiprocessing.pool import *\n\n\nclass DBArgs(object):\n\n def __init__(self, dbtype, config, dbname=None):\n self.dbtype = dbtype\n if self.dbtype == 'mysql':\n self.host = config['host']\n self.port = config['port']\n self.user = config['user']\n self.password = config['password']\n self.dbname = dbname if dbname else config['dbname']\n self.driver = 'com.mysql.jdbc.Driver'\n self.jdbc = 'jdbc:mysql://'\n else:\n self.host = config['host']\n self.port = config['port']\n self.user = config['user']\n self.password = config['password']\n self.dbname = dbname if dbname else config['dbname']\n self.driver = 'org.postgresql.Driver'\n self.jdbc = 'jdbc:postgresql://'\n\nclass Database():\n def __init__(self, args, timeout=-1):\n self.args = args\n self.conn = self.resetConn(timeout)\n\n\n # self.schema = self.compute_table_schema()\n\n def resetConn(self, timeout=-1):\n conn = psycopg2.connect(database=self.args.dbname,\n user=self.args.user,\n password=self.args.password,\n host=self.args.host,\n port=self.args.port)\n return conn\n \n def execute_sqls(self,sql):\n self.conn =self.resetConn(timeout=-1)\n cur = self.conn.cursor()\n cur.execute(sql)\n self.conn.commit()\n cur.close()\n self.conn.close()\n\n def execute_sql_duration(self, duration, sql, max_id=0, commit_interval=500):\n self.conn = self.resetConn(timeout=-1)\n cursor = self.conn.cursor()\n start = time.time()\n cnt = 0\n if duration > 0:\n while (time.time() - start) < duration:\n if max_id > 0:\n id = random.randint(1, max_id - 1)\n cursor.execute(sql + str(id) + ';')\n else:\n cursor.execute(sql)\n cnt += 1\n if cnt % commit_interval == 0:\n self.conn.commit()\n else:\n print(\"error, the duration should be larger than 0\")\n self.conn.commit()\n cursor.close()\n self.conn.close()\n return cnt\n\n def concurrent_execute_sql(self, threads, duration, sql, max_id=0, commit_interval=500):\n pool = ThreadPool(threads)\n results = [pool.apply_async(self.execute_sql_duration, (duration, sql, max_id, commit_interval)) for _ in range(threads)]\n pool.close()\n pool.join()\n return results\n\ndef init():\n #add the config\n config_path = \"/root/DB-GPT/config/tool_config.yaml\"\n with open(config_path, 'r') as config_file:\n config = yaml.safe_load(config_file) \n db_args =DBArgs('pgsql', config)\n return db_args\n\n\n#create a table\ndef create_table(table_name,colsize, ncolumns):\n db=Database(init())\n column_definitions = ', '.join(f'name{i} varchar({colsize})' for i in range(ncolumns))\n creat_sql = f'CREATE TABLE {table_name} (id int, {column_definitions}, time timestamp);'\n db.execute_sqls(creat_sql)\n\n#delete the table\ndef delete_table(table_name):\n db=Database(init())\n delete_sql=f'DROP TABLE if exists {table_name}'\n db.execute_sqls(delete_sql)\n\n#print the current time\ndef print_time():\n current_time = datetime.datetime.now()\n formatted_time = current_time.strftime(\"%Y-%m-%d %H:%M:%S\")\n 
print(formatted_time)\n\n\ndef missing_index(threads,duration,ncolumns,nrows,colsize,table_name='table1'):\n #create a new table\n print_time()\n db=Database(init())\n delete_table(table_name)\n create_table(table_name,colsize, ncolumns)\n\n # insert some data to be selected \n insert_definitions = ', '.join(f'(SELECT substr(md5(random()::text), 1, {colsize}))' for i in range(ncolumns))\n insert_data=f'insert into {table_name} select generate_series(1,{nrows}),{insert_definitions}, now();'\n db.execute_sqls(insert_data) \n\n #select without the index\n missing_index='select * from '+table_name+' where id='\n db.concurrent_execute_sql(threads,duration,missing_index,nrows)\n\n #delete the table\n delete_table(table_name)\n #print the end time\n print_time()\n\nif __name__ == \"__main__\":\n # Number of threads to use for concurrent inserts\n num_threads = 10\n \n # Duration for which to run the inserts (in seconds)\n insert_duration = None\n \n # Number of columns in the table\n num_columns = 5\n \n # Number of rows to insert\n num_rows = 2000000\n \n # Size of each column (in characters)\n column_size = 50\n \n # Table name\n table_name = 'table1'\n \n # Call the insert_large_data function\n missing_index(num_threads, insert_duration, num_columns, num_rows, column_size, table_name)\n", + "description": "In a data analytics platform, there are 10 users searching in a database table with 5 columns, 2,000,000 rows, each column having a size of 50 characters. However, the search query lacks the necessary indexes, resulting in a database exception.\n", + "desc": "In a banking scenario, there is a database named 'BankDB' that stores various financial transaction records. This database contains a table called 'TransactionHistory' that has 2,000,000 rows of data, with each row representing a transaction. The table has 5 columns, including transaction ID, account number, transaction type, transaction amount, and transaction date. The size of each column is 50 characters. However, the database lacks necessary indexes on commonly used columns, such as account number and transaction date. As a result, when multiple users simultaneously perform transaction queries, the database becomes inefficient in handling these concurrent requests. 
This inefficiency can lead to anomalies in the database, such as slow or failed queries, affecting the overall performance and user experience of the banking system.\n" + }, + "507": { + "start_time": "1697447695", + "end_time": "1697447766", + "start_timestamp": "2023-10-16 17:14:55", + "end_timestamp": "2023-10-16 17:16:06", + "alerts": [], + "labels": [ + "missing indexes" + ], + "command": "python anomaly_trigger/main.py --anomaly MISSING_INDEXES", + "script": "import psycopg2\nimport sys\nsys.path.append('/root/DB-GPT/')\nimport time\nimport datetime\nimport random\nimport yaml\nfrom multiprocessing.pool import *\n\n\nclass DBArgs(object):\n\n def __init__(self, dbtype, config, dbname=None):\n self.dbtype = dbtype\n if self.dbtype == 'mysql':\n self.host = config['host']\n self.port = config['port']\n self.user = config['user']\n self.password = config['password']\n self.dbname = dbname if dbname else config['dbname']\n self.driver = 'com.mysql.jdbc.Driver'\n self.jdbc = 'jdbc:mysql://'\n else:\n self.host = config['host']\n self.port = config['port']\n self.user = config['user']\n self.password = config['password']\n self.dbname = dbname if dbname else config['dbname']\n self.driver = 'org.postgresql.Driver'\n self.jdbc = 'jdbc:postgresql://'\n\nclass Database():\n def __init__(self, args, timeout=-1):\n self.args = args\n self.conn = self.resetConn(timeout)\n\n\n # self.schema = self.compute_table_schema()\n\n def resetConn(self, timeout=-1):\n conn = psycopg2.connect(database=self.args.dbname,\n user=self.args.user,\n password=self.args.password,\n host=self.args.host,\n port=self.args.port)\n return conn\n \n def execute_sqls(self,sql):\n self.conn =self.resetConn(timeout=-1)\n cur = self.conn.cursor()\n cur.execute(sql)\n self.conn.commit()\n cur.close()\n self.conn.close()\n\n def execute_sql_duration(self, duration, sql, max_id=0, commit_interval=500):\n self.conn = self.resetConn(timeout=-1)\n cursor = self.conn.cursor()\n start = time.time()\n cnt = 0\n if duration > 0:\n while (time.time() - start) < duration:\n if max_id > 0:\n id = random.randint(1, max_id - 1)\n cursor.execute(sql + str(id) + ';')\n else:\n cursor.execute(sql)\n cnt += 1\n if cnt % commit_interval == 0:\n self.conn.commit()\n else:\n print(\"error, the duration should be larger than 0\")\n self.conn.commit()\n cursor.close()\n self.conn.close()\n return cnt\n\n def concurrent_execute_sql(self, threads, duration, sql, max_id=0, commit_interval=500):\n pool = ThreadPool(threads)\n results = [pool.apply_async(self.execute_sql_duration, (duration, sql, max_id, commit_interval)) for _ in range(threads)]\n pool.close()\n pool.join()\n return results\n\ndef init():\n #add the config\n config_path = \"/root/DB-GPT/config/tool_config.yaml\"\n with open(config_path, 'r') as config_file:\n config = yaml.safe_load(config_file) \n db_args =DBArgs('pgsql', config)\n return db_args\n\n\n#create a table\ndef create_table(table_name,colsize, ncolumns):\n db=Database(init())\n column_definitions = ', '.join(f'name{i} varchar({colsize})' for i in range(ncolumns))\n creat_sql = f'CREATE TABLE {table_name} (id int, {column_definitions}, time timestamp);'\n db.execute_sqls(creat_sql)\n\n#delete the table\ndef delete_table(table_name):\n db=Database(init())\n delete_sql=f'DROP TABLE if exists {table_name}'\n db.execute_sqls(delete_sql)\n\n#print the current time\ndef print_time():\n current_time = datetime.datetime.now()\n formatted_time = current_time.strftime(\"%Y-%m-%d %H:%M:%S\")\n print(formatted_time)\n\n\ndef 
missing_index(threads,duration,ncolumns,nrows,colsize,table_name='table1'):\n #create a new table\n print_time()\n db=Database(init())\n delete_table(table_name)\n create_table(table_name,colsize, ncolumns)\n\n # insert some data to be selected \n insert_definitions = ', '.join(f'(SELECT substr(md5(random()::text), 1, {colsize}))' for i in range(ncolumns))\n insert_data=f'insert into {table_name} select generate_series(1,{nrows}),{insert_definitions}, now();'\n db.execute_sqls(insert_data) \n\n #select without the index\n missing_index='select * from '+table_name+' where id='\n db.concurrent_execute_sql(threads,duration,missing_index,nrows)\n\n #delete the table\n delete_table(table_name)\n #print the end time\n print_time()\n\nif __name__ == \"__main__\":\n # Number of threads to use for concurrent inserts\n num_threads = 10\n \n # Duration for which to run the inserts (in seconds)\n insert_duration = None\n \n # Number of columns in the table\n num_columns = 5\n \n # Number of rows to insert\n num_rows = 2000000\n \n # Size of each column (in characters)\n column_size = 50\n \n # Table name\n table_name = 'table1'\n \n # Call the insert_large_data function\n missing_index(num_threads, insert_duration, num_columns, num_rows, column_size, table_name)\n", + "description": "In a database for a financial institution, there are 10 clients simultaneously searching for information in a table containing 5 columns and 2,000,000 rows. Each column has a size of 50 characters. However, the necessary indexes for efficient searching are missing, resulting in slower database performance.\n", + "desc": "In an e-commerce platform, there is a database specifically used for storing information about various types of products. Within this database, there is a table called 'ProductDetails' that contains 2,000,000 rows of data, each representing a product entry. This table has 5 columns, each with a size of 50 characters, including product ID, name, price, stock quantity, and category. However, this table does not have the necessary indexes, particularly on commonly used columns such as name and category. As a result, when users search for products on the platform, the database becomes inefficient in handling a large number of concurrent search requests. 
This inefficiency can lead to anomalies in the database, such as search delays or failures, which can negatively impact the user experience and the overall operational efficiency of the e-commerce platform.\n" + }, + "508": { + "start_time": "1697447840", + "end_time": "1697447911", + "start_timestamp": "2023-10-16 17:17:20", + "end_timestamp": "2023-10-16 17:18:31", + "alerts": [], + "labels": [ + "missing indexes" + ], + "command": "python anomaly_trigger/main.py --anomaly MISSING_INDEXES", + "script": "import psycopg2\nimport sys\nsys.path.append('/root/DB-GPT/')\nimport time\nimport datetime\nimport random\nimport yaml\nfrom multiprocessing.pool import *\n\n\nclass DBArgs(object):\n\n def __init__(self, dbtype, config, dbname=None):\n self.dbtype = dbtype\n if self.dbtype == 'mysql':\n self.host = config['host']\n self.port = config['port']\n self.user = config['user']\n self.password = config['password']\n self.dbname = dbname if dbname else config['dbname']\n self.driver = 'com.mysql.jdbc.Driver'\n self.jdbc = 'jdbc:mysql://'\n else:\n self.host = config['host']\n self.port = config['port']\n self.user = config['user']\n self.password = config['password']\n self.dbname = dbname if dbname else config['dbname']\n self.driver = 'org.postgresql.Driver'\n self.jdbc = 'jdbc:postgresql://'\n\nclass Database():\n def __init__(self, args, timeout=-1):\n self.args = args\n self.conn = self.resetConn(timeout)\n\n\n # self.schema = self.compute_table_schema()\n\n def resetConn(self, timeout=-1):\n conn = psycopg2.connect(database=self.args.dbname,\n user=self.args.user,\n password=self.args.password,\n host=self.args.host,\n port=self.args.port)\n return conn\n \n def execute_sqls(self,sql):\n self.conn =self.resetConn(timeout=-1)\n cur = self.conn.cursor()\n cur.execute(sql)\n self.conn.commit()\n cur.close()\n self.conn.close()\n\n def execute_sql_duration(self, duration, sql, max_id=0, commit_interval=500):\n self.conn = self.resetConn(timeout=-1)\n cursor = self.conn.cursor()\n start = time.time()\n cnt = 0\n if duration > 0:\n while (time.time() - start) < duration:\n if max_id > 0:\n id = random.randint(1, max_id - 1)\n cursor.execute(sql + str(id) + ';')\n else:\n cursor.execute(sql)\n cnt += 1\n if cnt % commit_interval == 0:\n self.conn.commit()\n else:\n print(\"error, the duration should be larger than 0\")\n self.conn.commit()\n cursor.close()\n self.conn.close()\n return cnt\n\n def concurrent_execute_sql(self, threads, duration, sql, max_id=0, commit_interval=500):\n pool = ThreadPool(threads)\n results = [pool.apply_async(self.execute_sql_duration, (duration, sql, max_id, commit_interval)) for _ in range(threads)]\n pool.close()\n pool.join()\n return results\n\ndef init():\n #add the config\n config_path = \"/root/DB-GPT/config/tool_config.yaml\"\n with open(config_path, 'r') as config_file:\n config = yaml.safe_load(config_file) \n db_args =DBArgs('pgsql', config)\n return db_args\n\n\n#create a table\ndef create_table(table_name,colsize, ncolumns):\n db=Database(init())\n column_definitions = ', '.join(f'name{i} varchar({colsize})' for i in range(ncolumns))\n creat_sql = f'CREATE TABLE {table_name} (id int, {column_definitions}, time timestamp);'\n db.execute_sqls(creat_sql)\n\n#delete the table\ndef delete_table(table_name):\n db=Database(init())\n delete_sql=f'DROP TABLE if exists {table_name}'\n db.execute_sqls(delete_sql)\n\n#print the current time\ndef print_time():\n current_time = datetime.datetime.now()\n formatted_time = current_time.strftime(\"%Y-%m-%d %H:%M:%S\")\n 
print(formatted_time)\n\n\ndef missing_index(threads,duration,ncolumns,nrows,colsize,table_name='table1'):\n #create a new table\n print_time()\n db=Database(init())\n delete_table(table_name)\n create_table(table_name,colsize, ncolumns)\n\n # insert some data to be selected \n insert_definitions = ', '.join(f'(SELECT substr(md5(random()::text), 1, {colsize}))' for i in range(ncolumns))\n insert_data=f'insert into {table_name} select generate_series(1,{nrows}),{insert_definitions}, now();'\n db.execute_sqls(insert_data) \n\n #select without the index\n missing_index='select * from '+table_name+' where id='\n db.concurrent_execute_sql(threads,duration,missing_index,nrows)\n\n #delete the table\n delete_table(table_name)\n #print the end time\n print_time()\n\nif __name__ == \"__main__\":\n # Number of threads to use for concurrent inserts\n num_threads = 5\n \n # Duration for which to run the inserts (in seconds)\n insert_duration = None\n \n # Number of columns in the table\n num_columns = 5\n \n # Number of rows to insert\n num_rows = 4000000\n \n # Size of each column (in characters)\n column_size = 50\n \n # Table name\n table_name = 'table1'\n \n # Call the insert_large_data function\n missing_index(num_threads, insert_duration, num_columns, num_rows, column_size, table_name)\n", + "description": "In a financial management system, if there are 5 users searching in a database table containing 5 columns and 4,000,000 rows, each column with a size of 50 characters, but the search lacks the necessary index, an exception will occur in the system.\n", + "desc": "In a file sharing system scenario, imagine a database called 'FileShareDB', which is used by users or teams to share files. This database stores information about the files being shared, including metadata such as file names, sizes, uploaders, upload dates, and access permissions. On a typical day, multiple users may be uploading, downloading, or updating files simultaneously. For instance, a team working on a project might frequently upload new versions of files, while other team members download or edit them. Additionally, the system might be used for sharing large files such as presentations, videos, or graphic designs. However, due to the lack of necessary indexes, particularly on commonly accessed columns like file names, sizes, and uploaders, the database may become inefficient in handling the high volume of concurrent file operations. This can lead to delays or failures in file uploads or downloads, hindering the overall file sharing experience. Furthermore, the lack of indexing might also impact the performance of other database operations such as file search or access permission updates. 
In a file sharing system, these inefficiencies can have a significant impact on user productivity and the overall effectiveness of collaboration within teams.\n" + }, + "509": { + "start_time": "1697447971", + "end_time": "1697448042", + "start_timestamp": "2023-10-16 17:19:31", + "end_timestamp": "2023-10-16 17:20:42", + "alerts": [], + "labels": [ + "missing indexes" + ], + "command": "python anomaly_trigger/main.py --anomaly MISSING_INDEXES", + "script": "import psycopg2\nimport sys\nsys.path.append('/root/DB-GPT/')\nimport time\nimport datetime\nimport random\nimport yaml\nfrom multiprocessing.pool import *\n\n\nclass DBArgs(object):\n\n def __init__(self, dbtype, config, dbname=None):\n self.dbtype = dbtype\n if self.dbtype == 'mysql':\n self.host = config['host']\n self.port = config['port']\n self.user = config['user']\n self.password = config['password']\n self.dbname = dbname if dbname else config['dbname']\n self.driver = 'com.mysql.jdbc.Driver'\n self.jdbc = 'jdbc:mysql://'\n else:\n self.host = config['host']\n self.port = config['port']\n self.user = config['user']\n self.password = config['password']\n self.dbname = dbname if dbname else config['dbname']\n self.driver = 'org.postgresql.Driver'\n self.jdbc = 'jdbc:postgresql://'\n\nclass Database():\n def __init__(self, args, timeout=-1):\n self.args = args\n self.conn = self.resetConn(timeout)\n\n\n # self.schema = self.compute_table_schema()\n\n def resetConn(self, timeout=-1):\n conn = psycopg2.connect(database=self.args.dbname,\n user=self.args.user,\n password=self.args.password,\n host=self.args.host,\n port=self.args.port)\n return conn\n \n def execute_sqls(self,sql):\n self.conn =self.resetConn(timeout=-1)\n cur = self.conn.cursor()\n cur.execute(sql)\n self.conn.commit()\n cur.close()\n self.conn.close()\n\n def execute_sql_duration(self, duration, sql, max_id=0, commit_interval=500):\n self.conn = self.resetConn(timeout=-1)\n cursor = self.conn.cursor()\n start = time.time()\n cnt = 0\n if duration > 0:\n while (time.time() - start) < duration:\n if max_id > 0:\n id = random.randint(1, max_id - 1)\n cursor.execute(sql + str(id) + ';')\n else:\n cursor.execute(sql)\n cnt += 1\n if cnt % commit_interval == 0:\n self.conn.commit()\n else:\n print(\"error, the duration should be larger than 0\")\n self.conn.commit()\n cursor.close()\n self.conn.close()\n return cnt\n\n def concurrent_execute_sql(self, threads, duration, sql, max_id=0, commit_interval=500):\n pool = ThreadPool(threads)\n results = [pool.apply_async(self.execute_sql_duration, (duration, sql, max_id, commit_interval)) for _ in range(threads)]\n pool.close()\n pool.join()\n return results\n\ndef init():\n #add the config\n config_path = \"/root/DB-GPT/config/tool_config.yaml\"\n with open(config_path, 'r') as config_file:\n config = yaml.safe_load(config_file) \n db_args =DBArgs('pgsql', config)\n return db_args\n\n\n#create a table\ndef create_table(table_name,colsize, ncolumns):\n db=Database(init())\n column_definitions = ', '.join(f'name{i} varchar({colsize})' for i in range(ncolumns))\n creat_sql = f'CREATE TABLE {table_name} (id int, {column_definitions}, time timestamp);'\n db.execute_sqls(creat_sql)\n\n#delete the table\ndef delete_table(table_name):\n db=Database(init())\n delete_sql=f'DROP TABLE if exists {table_name}'\n db.execute_sqls(delete_sql)\n\n#print the current time\ndef print_time():\n current_time = datetime.datetime.now()\n formatted_time = current_time.strftime(\"%Y-%m-%d %H:%M:%S\")\n print(formatted_time)\n\n\ndef 
missing_index(threads,duration,ncolumns,nrows,colsize,table_name='table1'):\n #create a new table\n print_time()\n db=Database(init())\n delete_table(table_name)\n create_table(table_name,colsize, ncolumns)\n\n # insert some data to be selected \n insert_definitions = ', '.join(f'(SELECT substr(md5(random()::text), 1, {colsize}))' for i in range(ncolumns))\n insert_data=f'insert into {table_name} select generate_series(1,{nrows}),{insert_definitions}, now();'\n db.execute_sqls(insert_data) \n\n #select without the index\n missing_index='select * from '+table_name+' where id='\n db.concurrent_execute_sql(threads,duration,missing_index,nrows)\n\n #delete the table\n delete_table(table_name)\n #print the end time\n print_time()\n\nif __name__ == \"__main__\":\n # Number of threads to use for concurrent inserts\n num_threads = 5\n \n # Duration for which to run the inserts (in seconds)\n insert_duration = None\n \n # Number of columns in the table\n num_columns = 5\n \n # Number of rows to insert\n num_rows = 4000000\n \n # Size of each column (in characters)\n column_size = 50\n \n # Table name\n table_name = 'table1'\n \n # Call the insert_large_data function\n missing_index(num_threads, insert_duration, num_columns, num_rows, column_size, table_name)\n", + "description": "In a healthcare database, if there are 5 doctors searching for patient records in a database table containing 5 columns, 4,000,000 rows, each column size of 50 characters, but the search lacks the necessary index, it may result in slower performance and longer response times.\n", + "desc": "In the banking scenario, suppose there is a database called 'BankDB' that stores various banking information, such as customer data, account details, and transaction records. Within this database, there is a table called 'TransactionRecords' which contains 4,000,000 rows of transaction data. Each row represents a single transaction and includes information such as transaction ID, account number, transaction type, amount, date and time, and other relevant details. However, due to the lack of necessary indexes on commonly used columns, such as account number and transaction type, the database becomes inefficient in handling concurrent transaction queries from multiple users. 
As a result, the database may experience delays or failures when retrieving or updating transaction information, leading to a decline in the overall efficiency and user experience of the banking system.\n" + }, + "510": { + "start_time": "1697448102", + "end_time": "1697448174", + "start_timestamp": "2023-10-16 17:21:42", + "end_timestamp": "2023-10-16 17:22:54", + "alerts": [], + "labels": [ + "missing indexes" + ], + "command": "python anomaly_trigger/main.py --anomaly MISSING_INDEXES", + "script": "import psycopg2\nimport sys\nsys.path.append('/root/DB-GPT/')\nimport time\nimport datetime\nimport random\nimport yaml\nfrom multiprocessing.pool import *\n\n\nclass DBArgs(object):\n\n def __init__(self, dbtype, config, dbname=None):\n self.dbtype = dbtype\n if self.dbtype == 'mysql':\n self.host = config['host']\n self.port = config['port']\n self.user = config['user']\n self.password = config['password']\n self.dbname = dbname if dbname else config['dbname']\n self.driver = 'com.mysql.jdbc.Driver'\n self.jdbc = 'jdbc:mysql://'\n else:\n self.host = config['host']\n self.port = config['port']\n self.user = config['user']\n self.password = config['password']\n self.dbname = dbname if dbname else config['dbname']\n self.driver = 'org.postgresql.Driver'\n self.jdbc = 'jdbc:postgresql://'\n\nclass Database():\n def __init__(self, args, timeout=-1):\n self.args = args\n self.conn = self.resetConn(timeout)\n\n\n # self.schema = self.compute_table_schema()\n\n def resetConn(self, timeout=-1):\n conn = psycopg2.connect(database=self.args.dbname,\n user=self.args.user,\n password=self.args.password,\n host=self.args.host,\n port=self.args.port)\n return conn\n \n def execute_sqls(self,sql):\n self.conn =self.resetConn(timeout=-1)\n cur = self.conn.cursor()\n cur.execute(sql)\n self.conn.commit()\n cur.close()\n self.conn.close()\n\n def execute_sql_duration(self, duration, sql, max_id=0, commit_interval=500):\n self.conn = self.resetConn(timeout=-1)\n cursor = self.conn.cursor()\n start = time.time()\n cnt = 0\n if duration > 0:\n while (time.time() - start) < duration:\n if max_id > 0:\n id = random.randint(1, max_id - 1)\n cursor.execute(sql + str(id) + ';')\n else:\n cursor.execute(sql)\n cnt += 1\n if cnt % commit_interval == 0:\n self.conn.commit()\n else:\n print(\"error, the duration should be larger than 0\")\n self.conn.commit()\n cursor.close()\n self.conn.close()\n return cnt\n\n def concurrent_execute_sql(self, threads, duration, sql, max_id=0, commit_interval=500):\n pool = ThreadPool(threads)\n results = [pool.apply_async(self.execute_sql_duration, (duration, sql, max_id, commit_interval)) for _ in range(threads)]\n pool.close()\n pool.join()\n return results\n\ndef init():\n #add the config\n config_path = \"/root/DB-GPT/config/tool_config.yaml\"\n with open(config_path, 'r') as config_file:\n config = yaml.safe_load(config_file) \n db_args =DBArgs('pgsql', config)\n return db_args\n\n\n#create a table\ndef create_table(table_name,colsize, ncolumns):\n db=Database(init())\n column_definitions = ', '.join(f'name{i} varchar({colsize})' for i in range(ncolumns))\n creat_sql = f'CREATE TABLE {table_name} (id int, {column_definitions}, time timestamp);'\n db.execute_sqls(creat_sql)\n\n#delete the table\ndef delete_table(table_name):\n db=Database(init())\n delete_sql=f'DROP TABLE if exists {table_name}'\n db.execute_sqls(delete_sql)\n\n#print the current time\ndef print_time():\n current_time = datetime.datetime.now()\n formatted_time = current_time.strftime(\"%Y-%m-%d %H:%M:%S\")\n 
print(formatted_time)\n\n\ndef missing_index(threads,duration,ncolumns,nrows,colsize,table_name='table1'):\n #create a new table\n print_time()\n db=Database(init())\n delete_table(table_name)\n create_table(table_name,colsize, ncolumns)\n\n # insert some data to be selected \n insert_definitions = ', '.join(f'(SELECT substr(md5(random()::text), 1, {colsize}))' for i in range(ncolumns))\n insert_data=f'insert into {table_name} select generate_series(1,{nrows}),{insert_definitions}, now();'\n db.execute_sqls(insert_data) \n\n #select without the index\n missing_index='select * from '+table_name+' where id='\n db.concurrent_execute_sql(threads,duration,missing_index,nrows)\n\n #delete the table\n delete_table(table_name)\n #print the end time\n print_time()\n\nif __name__ == \"__main__\":\n # Number of threads to use for concurrent inserts\n num_threads = 10\n \n # Duration for which to run the inserts (in seconds)\n insert_duration = None\n \n # Number of columns in the table\n num_columns = 5\n \n # Number of rows to insert\n num_rows = 4000000\n \n # Size of each column (in characters)\n column_size = 50\n \n # Table name\n table_name = 'table1'\n \n # Call the insert_large_data function\n missing_index(num_threads, insert_duration, num_columns, num_rows, column_size, table_name)\n", + "description": "In a financial transaction database, if there are 10 concurrent transactions being processed and searching through a database table with 5 columns, 4,000,000 rows, where each column has a size of 50 characters, but the necessary index is missing, it can result in delays and inefficiencies in retrieving transaction data.\n", + "desc": "In a business intelligence scenario, suppose there is a large-scale financial analysis database named 'BI_FinancialDB', which stores and analyzes financial data for various companies. One key table in this database is 'FinancialRecords', which contains extensive information about financial transactions and statements. This table consists of 4,000,000 rows, with each row representing a financial record, and a total of 5 columns, each capable of containing up to 50 characters. These columns may include transaction ID, transaction type (such as income, expenditure, assets, liabilities), amount, date, and department.To support efficient financial analysis queries, such as profit margin calculations, expense breakdowns, or budget analysis, the database administrator needs to create proper indexes on the frequently used columns, such as transaction type, date, or department.However, in the given scenario, the 'FinancialRecords' table lacks necessary indexes on these important columns. As a result, when multiple users simultaneously perform financial analysis queries on the database, the lack of indexes slows down query execution. 
The database has to resort to full table scans, which extensively read data from disk and consume significant system resources.This inefficiency in query processing not only leads to delays in generating reports and analyzing financial data but can also affect the overall performance and user experience of the business intelligence system.\n" + }, + "511": { + "start_time": "1697448234", + "end_time": "1697448306", + "start_timestamp": "2023-10-16 17:23:54", + "end_timestamp": "2023-10-16 17:25:06", + "alerts": [], + "labels": [ + "missing indexes" + ], + "command": "python anomaly_trigger/main.py --anomaly MISSING_INDEXES", + "script": "import psycopg2\nimport sys\nsys.path.append('/root/DB-GPT/')\nimport time\nimport datetime\nimport random\nimport yaml\nfrom multiprocessing.pool import *\n\n\nclass DBArgs(object):\n\n def __init__(self, dbtype, config, dbname=None):\n self.dbtype = dbtype\n if self.dbtype == 'mysql':\n self.host = config['host']\n self.port = config['port']\n self.user = config['user']\n self.password = config['password']\n self.dbname = dbname if dbname else config['dbname']\n self.driver = 'com.mysql.jdbc.Driver'\n self.jdbc = 'jdbc:mysql://'\n else:\n self.host = config['host']\n self.port = config['port']\n self.user = config['user']\n self.password = config['password']\n self.dbname = dbname if dbname else config['dbname']\n self.driver = 'org.postgresql.Driver'\n self.jdbc = 'jdbc:postgresql://'\n\nclass Database():\n def __init__(self, args, timeout=-1):\n self.args = args\n self.conn = self.resetConn(timeout)\n\n\n # self.schema = self.compute_table_schema()\n\n def resetConn(self, timeout=-1):\n conn = psycopg2.connect(database=self.args.dbname,\n user=self.args.user,\n password=self.args.password,\n host=self.args.host,\n port=self.args.port)\n return conn\n \n def execute_sqls(self,sql):\n self.conn =self.resetConn(timeout=-1)\n cur = self.conn.cursor()\n cur.execute(sql)\n self.conn.commit()\n cur.close()\n self.conn.close()\n\n def execute_sql_duration(self, duration, sql, max_id=0, commit_interval=500):\n self.conn = self.resetConn(timeout=-1)\n cursor = self.conn.cursor()\n start = time.time()\n cnt = 0\n if duration > 0:\n while (time.time() - start) < duration:\n if max_id > 0:\n id = random.randint(1, max_id - 1)\n cursor.execute(sql + str(id) + ';')\n else:\n cursor.execute(sql)\n cnt += 1\n if cnt % commit_interval == 0:\n self.conn.commit()\n else:\n print(\"error, the duration should be larger than 0\")\n self.conn.commit()\n cursor.close()\n self.conn.close()\n return cnt\n\n def concurrent_execute_sql(self, threads, duration, sql, max_id=0, commit_interval=500):\n pool = ThreadPool(threads)\n results = [pool.apply_async(self.execute_sql_duration, (duration, sql, max_id, commit_interval)) for _ in range(threads)]\n pool.close()\n pool.join()\n return results\n\ndef init():\n #add the config\n config_path = \"/root/DB-GPT/config/tool_config.yaml\"\n with open(config_path, 'r') as config_file:\n config = yaml.safe_load(config_file) \n db_args =DBArgs('pgsql', config)\n return db_args\n\n\n#create a table\ndef create_table(table_name,colsize, ncolumns):\n db=Database(init())\n column_definitions = ', '.join(f'name{i} varchar({colsize})' for i in range(ncolumns))\n creat_sql = f'CREATE TABLE {table_name} (id int, {column_definitions}, time timestamp);'\n db.execute_sqls(creat_sql)\n\n#delete the table\ndef delete_table(table_name):\n db=Database(init())\n delete_sql=f'DROP TABLE if exists {table_name}'\n db.execute_sqls(delete_sql)\n\n#print the 
current time\ndef print_time():\n current_time = datetime.datetime.now()\n formatted_time = current_time.strftime(\"%Y-%m-%d %H:%M:%S\")\n print(formatted_time)\n\n\ndef missing_index(threads,duration,ncolumns,nrows,colsize,table_name='table1'):\n #create a new table\n print_time()\n db=Database(init())\n delete_table(table_name)\n create_table(table_name,colsize, ncolumns)\n\n # insert some data to be selected \n insert_definitions = ', '.join(f'(SELECT substr(md5(random()::text), 1, {colsize}))' for i in range(ncolumns))\n insert_data=f'insert into {table_name} select generate_series(1,{nrows}),{insert_definitions}, now();'\n db.execute_sqls(insert_data) \n\n #select without the index\n missing_index='select * from '+table_name+' where id='\n db.concurrent_execute_sql(threads,duration,missing_index,nrows)\n\n #delete the table\n delete_table(table_name)\n #print the end time\n print_time()\n\nif __name__ == \"__main__\":\n # Number of threads to use for concurrent inserts\n num_threads = 10\n \n # Duration for which to run the inserts (in seconds)\n insert_duration = None\n \n # Number of columns in the table\n num_columns = 5\n \n # Number of rows to insert\n num_rows = 4000000\n \n # Size of each column (in characters)\n column_size = 50\n \n # Table name\n table_name = 'table1'\n \n # Call the insert_large_data function\n missing_index(num_threads, insert_duration, num_columns, num_rows, column_size, table_name)\n", + "description": "In an online bookstore with 10 concurrent users, a database table containing 5 columns and 4,000,000 rows, each column having a size of 50 characters, is searched without the necessary indexes. This results in a database exception due to the lack of optimized indexing for efficient search operations.\n", + "desc": "In a database scenario for a file sharing system, suppose there is a database called 'FileShareDB' that is used by teams or organizations to share files. This database stores both the files themselves and metadata about the files, such as the uploader's information, file size, creation date, modification date, version history, access permissions, and download count. When multiple users are uploading, downloading, or editing files simultaneously, the system may experience difficulties due to missing indexes. For example, if there are not enough indexes on commonly used columns such as file name, uploader, or creation date, the database might struggle to efficiently handle the high concurrency of queries related to these columns. This inefficiency can result in slower search or retrieval times for files, negatively impacting the user experience. 
Moreover, the lack of indexes can lead to increased resource consumption and longer execution times for queries, ultimately affecting the overall performance and responsiveness of the file sharing system.\n" + }, + "512": { + "start_time": "1697448378", + "end_time": "1697448449", + "start_timestamp": "2023-10-16 17:26:18", + "end_timestamp": "2023-10-16 17:27:29", + "alerts": [], + "labels": [ + "missing indexes" + ], + "command": "python anomaly_trigger/main.py --anomaly MISSING_INDEXES", + "script": "import psycopg2\nimport sys\nsys.path.append('/root/DB-GPT/')\nimport time\nimport datetime\nimport random\nimport yaml\nfrom multiprocessing.pool import *\n\n\nclass DBArgs(object):\n\n def __init__(self, dbtype, config, dbname=None):\n self.dbtype = dbtype\n if self.dbtype == 'mysql':\n self.host = config['host']\n self.port = config['port']\n self.user = config['user']\n self.password = config['password']\n self.dbname = dbname if dbname else config['dbname']\n self.driver = 'com.mysql.jdbc.Driver'\n self.jdbc = 'jdbc:mysql://'\n else:\n self.host = config['host']\n self.port = config['port']\n self.user = config['user']\n self.password = config['password']\n self.dbname = dbname if dbname else config['dbname']\n self.driver = 'org.postgresql.Driver'\n self.jdbc = 'jdbc:postgresql://'\n\nclass Database():\n def __init__(self, args, timeout=-1):\n self.args = args\n self.conn = self.resetConn(timeout)\n\n\n # self.schema = self.compute_table_schema()\n\n def resetConn(self, timeout=-1):\n conn = psycopg2.connect(database=self.args.dbname,\n user=self.args.user,\n password=self.args.password,\n host=self.args.host,\n port=self.args.port)\n return conn\n \n def execute_sqls(self,sql):\n self.conn =self.resetConn(timeout=-1)\n cur = self.conn.cursor()\n cur.execute(sql)\n self.conn.commit()\n cur.close()\n self.conn.close()\n\n def execute_sql_duration(self, duration, sql, max_id=0, commit_interval=500):\n self.conn = self.resetConn(timeout=-1)\n cursor = self.conn.cursor()\n start = time.time()\n cnt = 0\n if duration > 0:\n while (time.time() - start) < duration:\n if max_id > 0:\n id = random.randint(1, max_id - 1)\n cursor.execute(sql + str(id) + ';')\n else:\n cursor.execute(sql)\n cnt += 1\n if cnt % commit_interval == 0:\n self.conn.commit()\n else:\n print(\"error, the duration should be larger than 0\")\n self.conn.commit()\n cursor.close()\n self.conn.close()\n return cnt\n\n def concurrent_execute_sql(self, threads, duration, sql, max_id=0, commit_interval=500):\n pool = ThreadPool(threads)\n results = [pool.apply_async(self.execute_sql_duration, (duration, sql, max_id, commit_interval)) for _ in range(threads)]\n pool.close()\n pool.join()\n return results\n\ndef init():\n #add the config\n config_path = \"/root/DB-GPT/config/tool_config.yaml\"\n with open(config_path, 'r') as config_file:\n config = yaml.safe_load(config_file) \n db_args =DBArgs('pgsql', config)\n return db_args\n\n\n#create a table\ndef create_table(table_name,colsize, ncolumns):\n db=Database(init())\n column_definitions = ', '.join(f'name{i} varchar({colsize})' for i in range(ncolumns))\n creat_sql = f'CREATE TABLE {table_name} (id int, {column_definitions}, time timestamp);'\n db.execute_sqls(creat_sql)\n\n#delete the table\ndef delete_table(table_name):\n db=Database(init())\n delete_sql=f'DROP TABLE if exists {table_name}'\n db.execute_sqls(delete_sql)\n\n#print the current time\ndef print_time():\n current_time = datetime.datetime.now()\n formatted_time = current_time.strftime(\"%Y-%m-%d %H:%M:%S\")\n 
print(formatted_time)\n\n\ndef missing_index(threads,duration,ncolumns,nrows,colsize,table_name='table1'):\n #create a new table\n print_time()\n db=Database(init())\n delete_table(table_name)\n create_table(table_name,colsize, ncolumns)\n\n # insert some data to be selected \n insert_definitions = ', '.join(f'(SELECT substr(md5(random()::text), 1, {colsize}))' for i in range(ncolumns))\n insert_data=f'insert into {table_name} select generate_series(1,{nrows}),{insert_definitions}, now();'\n db.execute_sqls(insert_data) \n\n #select without the index\n missing_index='select * from '+table_name+' where id='\n db.concurrent_execute_sql(threads,duration,missing_index,nrows)\n\n #delete the table\n delete_table(table_name)\n #print the end time\n print_time()\n\nif __name__ == \"__main__\":\n # Number of threads to use for concurrent inserts\n num_threads = 5\n \n # Duration for which to run the inserts (in seconds)\n insert_duration = None\n \n # Number of columns in the table\n num_columns = 5\n \n # Number of rows to insert\n num_rows = 2000000\n \n # Size of each column (in characters)\n column_size = 100\n \n # Table name\n table_name = 'table1'\n \n # Call the insert_large_data function\n missing_index(num_threads, insert_duration, num_columns, num_rows, column_size, table_name)\n", + "description": "In a database for a media content platform, if there are 5 users searching in a table containing 5 columns, with each column size of 100 characters, and a total of 2,000,000 rows, but the search operation lacks the necessary indexes, it can result in an exception being raised in the database.\n", + "desc": "In the banking scenario, suppose there is a database used by a bank named 'BankDB' which records the account information of its customers. This database contains a key table called 'AccountDetails' which consists of information about customer accounts, such as account number, customer name, balance, account type, and other related attributes. In this scenario, due to the rapid increase in the number of customer accounts, the 'AccountDetails' table contains a large volume of data, with around 2 million rows. Each row represents an account entry, and there are 5 columns in total, each capable of storing up to 100 characters. However, the database lacks necessary indexes on commonly used search columns such as account number and customer name. 
This results in inefficiency when handling a large number of concurrent search requests, leading to slower response times and potentially impacting the overall user experience and operational efficiency of the bank.\n" + }, + "513": { + "start_time": "1697448509", + "end_time": "1697448580", + "start_timestamp": "2023-10-16 17:28:29", + "end_timestamp": "2023-10-16 17:29:40", + "alerts": [], + "labels": [ + "missing indexes" + ], + "command": "python anomaly_trigger/main.py --anomaly MISSING_INDEXES", + "script": "import psycopg2\nimport sys\nsys.path.append('/root/DB-GPT/')\nimport time\nimport datetime\nimport random\nimport yaml\nfrom multiprocessing.pool import *\n\n\nclass DBArgs(object):\n\n def __init__(self, dbtype, config, dbname=None):\n self.dbtype = dbtype\n if self.dbtype == 'mysql':\n self.host = config['host']\n self.port = config['port']\n self.user = config['user']\n self.password = config['password']\n self.dbname = dbname if dbname else config['dbname']\n self.driver = 'com.mysql.jdbc.Driver'\n self.jdbc = 'jdbc:mysql://'\n else:\n self.host = config['host']\n self.port = config['port']\n self.user = config['user']\n self.password = config['password']\n self.dbname = dbname if dbname else config['dbname']\n self.driver = 'org.postgresql.Driver'\n self.jdbc = 'jdbc:postgresql://'\n\nclass Database():\n def __init__(self, args, timeout=-1):\n self.args = args\n self.conn = self.resetConn(timeout)\n\n\n # self.schema = self.compute_table_schema()\n\n def resetConn(self, timeout=-1):\n conn = psycopg2.connect(database=self.args.dbname,\n user=self.args.user,\n password=self.args.password,\n host=self.args.host,\n port=self.args.port)\n return conn\n \n def execute_sqls(self,sql):\n self.conn =self.resetConn(timeout=-1)\n cur = self.conn.cursor()\n cur.execute(sql)\n self.conn.commit()\n cur.close()\n self.conn.close()\n\n def execute_sql_duration(self, duration, sql, max_id=0, commit_interval=500):\n self.conn = self.resetConn(timeout=-1)\n cursor = self.conn.cursor()\n start = time.time()\n cnt = 0\n if duration > 0:\n while (time.time() - start) < duration:\n if max_id > 0:\n id = random.randint(1, max_id - 1)\n cursor.execute(sql + str(id) + ';')\n else:\n cursor.execute(sql)\n cnt += 1\n if cnt % commit_interval == 0:\n self.conn.commit()\n else:\n print(\"error, the duration should be larger than 0\")\n self.conn.commit()\n cursor.close()\n self.conn.close()\n return cnt\n\n def concurrent_execute_sql(self, threads, duration, sql, max_id=0, commit_interval=500):\n pool = ThreadPool(threads)\n results = [pool.apply_async(self.execute_sql_duration, (duration, sql, max_id, commit_interval)) for _ in range(threads)]\n pool.close()\n pool.join()\n return results\n\ndef init():\n #add the config\n config_path = \"/root/DB-GPT/config/tool_config.yaml\"\n with open(config_path, 'r') as config_file:\n config = yaml.safe_load(config_file) \n db_args =DBArgs('pgsql', config)\n return db_args\n\n\n#create a table\ndef create_table(table_name,colsize, ncolumns):\n db=Database(init())\n column_definitions = ', '.join(f'name{i} varchar({colsize})' for i in range(ncolumns))\n creat_sql = f'CREATE TABLE {table_name} (id int, {column_definitions}, time timestamp);'\n db.execute_sqls(creat_sql)\n\n#delete the table\ndef delete_table(table_name):\n db=Database(init())\n delete_sql=f'DROP TABLE if exists {table_name}'\n db.execute_sqls(delete_sql)\n\n#print the current time\ndef print_time():\n current_time = datetime.datetime.now()\n formatted_time = current_time.strftime(\"%Y-%m-%d 
%H:%M:%S\")\n print(formatted_time)\n\n\ndef missing_index(threads,duration,ncolumns,nrows,colsize,table_name='table1'):\n #create a new table\n print_time()\n db=Database(init())\n delete_table(table_name)\n create_table(table_name,colsize, ncolumns)\n\n # insert some data to be selected \n insert_definitions = ', '.join(f'(SELECT substr(md5(random()::text), 1, {colsize}))' for i in range(ncolumns))\n insert_data=f'insert into {table_name} select generate_series(1,{nrows}),{insert_definitions}, now();'\n db.execute_sqls(insert_data) \n\n #select without the index\n missing_index='select * from '+table_name+' where id='\n db.concurrent_execute_sql(threads,duration,missing_index,nrows)\n\n #delete the table\n delete_table(table_name)\n #print the end time\n print_time()\n\nif __name__ == \"__main__\":\n # Number of threads to use for concurrent inserts\n num_threads = 5\n \n # Duration for which to run the inserts (in seconds)\n insert_duration = None\n \n # Number of columns in the table\n num_columns = 5\n \n # Number of rows to insert\n num_rows = 2000000\n \n # Size of each column (in characters)\n column_size = 100\n \n # Table name\n table_name = 'table1'\n \n # Call the insert_large_data function\n missing_index(num_threads, insert_duration, num_columns, num_rows, column_size, table_name)\n", + "description": "In a database used by an online store, when 5 users simultaneously search for products using only 5 columns, but with a large table of 2,000,000 rows, each with a column size of 100 characters, the lack of proper indexing causes a database exception.\n", + "desc": "In an online banking system, suppose there is a database called 'BankingSystemDB' that stores detailed information about customers and their transactions. This database includes a key table called 'TransactionRecords' which contains information about various banking transactions. Each row in this table represents a transaction record for an account, with a total of 5 columns, each containing data up to 100 characters. These columns might include transaction ID, account number, transaction type (such as deposit, withdrawal, or transfer), transaction amount, and transaction date and time. At one time, 5 users are simultaneously performing transactions on their accounts through the online banking platform. However, due to the lack of necessary indexes on commonly used columns such as account number and transaction type, the database struggles to efficiently handle these concurrent transactions. 
This inefficiency can lead to anomalies in the database, resulting in a decline in user experience (such as transaction delays or failures), and potentially impacting the overall operational efficiency and reputation of the online banking system.\n" + }, + "514": { + "start_time": "1697448640", + "end_time": "1697448711", + "start_timestamp": "2023-10-16 17:30:40", + "end_timestamp": "2023-10-16 17:31:51", + "alerts": [], + "labels": [ + "missing indexes" + ], + "command": "python anomaly_trigger/main.py --anomaly MISSING_INDEXES", + "script": "import psycopg2\nimport sys\nsys.path.append('/root/DB-GPT/')\nimport time\nimport datetime\nimport random\nimport yaml\nfrom multiprocessing.pool import *\n\n\nclass DBArgs(object):\n\n def __init__(self, dbtype, config, dbname=None):\n self.dbtype = dbtype\n if self.dbtype == 'mysql':\n self.host = config['host']\n self.port = config['port']\n self.user = config['user']\n self.password = config['password']\n self.dbname = dbname if dbname else config['dbname']\n self.driver = 'com.mysql.jdbc.Driver'\n self.jdbc = 'jdbc:mysql://'\n else:\n self.host = config['host']\n self.port = config['port']\n self.user = config['user']\n self.password = config['password']\n self.dbname = dbname if dbname else config['dbname']\n self.driver = 'org.postgresql.Driver'\n self.jdbc = 'jdbc:postgresql://'\n\nclass Database():\n def __init__(self, args, timeout=-1):\n self.args = args\n self.conn = self.resetConn(timeout)\n\n\n # self.schema = self.compute_table_schema()\n\n def resetConn(self, timeout=-1):\n conn = psycopg2.connect(database=self.args.dbname,\n user=self.args.user,\n password=self.args.password,\n host=self.args.host,\n port=self.args.port)\n return conn\n \n def execute_sqls(self,sql):\n self.conn =self.resetConn(timeout=-1)\n cur = self.conn.cursor()\n cur.execute(sql)\n self.conn.commit()\n cur.close()\n self.conn.close()\n\n def execute_sql_duration(self, duration, sql, max_id=0, commit_interval=500):\n self.conn = self.resetConn(timeout=-1)\n cursor = self.conn.cursor()\n start = time.time()\n cnt = 0\n if duration > 0:\n while (time.time() - start) < duration:\n if max_id > 0:\n id = random.randint(1, max_id - 1)\n cursor.execute(sql + str(id) + ';')\n else:\n cursor.execute(sql)\n cnt += 1\n if cnt % commit_interval == 0:\n self.conn.commit()\n else:\n print(\"error, the duration should be larger than 0\")\n self.conn.commit()\n cursor.close()\n self.conn.close()\n return cnt\n\n def concurrent_execute_sql(self, threads, duration, sql, max_id=0, commit_interval=500):\n pool = ThreadPool(threads)\n results = [pool.apply_async(self.execute_sql_duration, (duration, sql, max_id, commit_interval)) for _ in range(threads)]\n pool.close()\n pool.join()\n return results\n\ndef init():\n #add the config\n config_path = \"/root/DB-GPT/config/tool_config.yaml\"\n with open(config_path, 'r') as config_file:\n config = yaml.safe_load(config_file) \n db_args =DBArgs('pgsql', config)\n return db_args\n\n\n#create a table\ndef create_table(table_name,colsize, ncolumns):\n db=Database(init())\n column_definitions = ', '.join(f'name{i} varchar({colsize})' for i in range(ncolumns))\n creat_sql = f'CREATE TABLE {table_name} (id int, {column_definitions}, time timestamp);'\n db.execute_sqls(creat_sql)\n\n#delete the table\ndef delete_table(table_name):\n db=Database(init())\n delete_sql=f'DROP TABLE if exists {table_name}'\n db.execute_sqls(delete_sql)\n\n#print the current time\ndef print_time():\n current_time = datetime.datetime.now()\n formatted_time = 
current_time.strftime(\"%Y-%m-%d %H:%M:%S\")\n print(formatted_time)\n\n\ndef missing_index(threads,duration,ncolumns,nrows,colsize,table_name='table1'):\n #create a new table\n print_time()\n db=Database(init())\n delete_table(table_name)\n create_table(table_name,colsize, ncolumns)\n\n # insert some data to be selected \n insert_definitions = ', '.join(f'(SELECT substr(md5(random()::text), 1, {colsize}))' for i in range(ncolumns))\n insert_data=f'insert into {table_name} select generate_series(1,{nrows}),{insert_definitions}, now();'\n db.execute_sqls(insert_data) \n\n #select without the index\n missing_index='select * from '+table_name+' where id='\n db.concurrent_execute_sql(threads,duration,missing_index,nrows)\n\n #delete the table\n delete_table(table_name)\n #print the end time\n print_time()\n\nif __name__ == \"__main__\":\n # Number of threads to use for concurrent inserts\n num_threads = 10\n \n # Duration for which to run the inserts (in seconds)\n insert_duration = None\n \n # Number of columns in the table\n num_columns = 5\n \n # Number of rows to insert\n num_rows = 2000000\n \n # Size of each column (in characters)\n column_size = 100\n \n # Table name\n table_name = 'table1'\n \n # Call the insert_large_data function\n missing_index(num_threads, insert_duration, num_columns, num_rows, column_size, table_name)\n", + "description": "In a database of an online platform, 10 users are searching in a table with 5 columns and 2,000,000 rows. Each column has a size of 100 characters. However, the search is lacking the necessary index, resulting in a database exception.\n", + "desc": "In a business intelligence scenario, suppose there is a database named 'BusinessAnalyticsDB', which is used for storing and analyzing various types of business data. In this database, there is a key table named 'SalesData', containing information about sales records. The table consists of 2,000,000 rows of data, with each row representing a sales record and having a total of 5 columns, each capable of storing up to 100 characters. These columns may include sales ID, customer ID, product ID, sales date, and sales amount. However, due to the lack of necessary indexes on commonly used columns, such as customer ID or product ID, the database becomes inefficient in handling a large number of queries or analysis operations on the sales data. 
This inefficiency can lead to delays or failures in generating reports or performing data analysis, ultimately impacting the effectiveness of decision-making processes in the business.\n" + }, + "515": { + "start_time": "1697448771", + "end_time": "1697448842", + "start_timestamp": "2023-10-16 17:32:51", + "end_timestamp": "2023-10-16 17:34:02", + "alerts": [], + "labels": [ + "missing indexes" + ], + "command": "python anomaly_trigger/main.py --anomaly MISSING_INDEXES", + "script": "import psycopg2\nimport sys\nsys.path.append('/root/DB-GPT/')\nimport time\nimport datetime\nimport random\nimport yaml\nfrom multiprocessing.pool import *\n\n\nclass DBArgs(object):\n\n def __init__(self, dbtype, config, dbname=None):\n self.dbtype = dbtype\n if self.dbtype == 'mysql':\n self.host = config['host']\n self.port = config['port']\n self.user = config['user']\n self.password = config['password']\n self.dbname = dbname if dbname else config['dbname']\n self.driver = 'com.mysql.jdbc.Driver'\n self.jdbc = 'jdbc:mysql://'\n else:\n self.host = config['host']\n self.port = config['port']\n self.user = config['user']\n self.password = config['password']\n self.dbname = dbname if dbname else config['dbname']\n self.driver = 'org.postgresql.Driver'\n self.jdbc = 'jdbc:postgresql://'\n\nclass Database():\n def __init__(self, args, timeout=-1):\n self.args = args\n self.conn = self.resetConn(timeout)\n\n\n # self.schema = self.compute_table_schema()\n\n def resetConn(self, timeout=-1):\n conn = psycopg2.connect(database=self.args.dbname,\n user=self.args.user,\n password=self.args.password,\n host=self.args.host,\n port=self.args.port)\n return conn\n \n def execute_sqls(self,sql):\n self.conn =self.resetConn(timeout=-1)\n cur = self.conn.cursor()\n cur.execute(sql)\n self.conn.commit()\n cur.close()\n self.conn.close()\n\n def execute_sql_duration(self, duration, sql, max_id=0, commit_interval=500):\n self.conn = self.resetConn(timeout=-1)\n cursor = self.conn.cursor()\n start = time.time()\n cnt = 0\n if duration > 0:\n while (time.time() - start) < duration:\n if max_id > 0:\n id = random.randint(1, max_id - 1)\n cursor.execute(sql + str(id) + ';')\n else:\n cursor.execute(sql)\n cnt += 1\n if cnt % commit_interval == 0:\n self.conn.commit()\n else:\n print(\"error, the duration should be larger than 0\")\n self.conn.commit()\n cursor.close()\n self.conn.close()\n return cnt\n\n def concurrent_execute_sql(self, threads, duration, sql, max_id=0, commit_interval=500):\n pool = ThreadPool(threads)\n results = [pool.apply_async(self.execute_sql_duration, (duration, sql, max_id, commit_interval)) for _ in range(threads)]\n pool.close()\n pool.join()\n return results\n\ndef init():\n #add the config\n config_path = \"/root/DB-GPT/config/tool_config.yaml\"\n with open(config_path, 'r') as config_file:\n config = yaml.safe_load(config_file) \n db_args =DBArgs('pgsql', config)\n return db_args\n\n\n#create a table\ndef create_table(table_name,colsize, ncolumns):\n db=Database(init())\n column_definitions = ', '.join(f'name{i} varchar({colsize})' for i in range(ncolumns))\n creat_sql = f'CREATE TABLE {table_name} (id int, {column_definitions}, time timestamp);'\n db.execute_sqls(creat_sql)\n\n#delete the table\ndef delete_table(table_name):\n db=Database(init())\n delete_sql=f'DROP TABLE if exists {table_name}'\n db.execute_sqls(delete_sql)\n\n#print the current time\ndef print_time():\n current_time = datetime.datetime.now()\n formatted_time = current_time.strftime(\"%Y-%m-%d %H:%M:%S\")\n 
print(formatted_time)\n\n\ndef missing_index(threads,duration,ncolumns,nrows,colsize,table_name='table1'):\n #create a new table\n print_time()\n db=Database(init())\n delete_table(table_name)\n create_table(table_name,colsize, ncolumns)\n\n # insert some data to be selected \n insert_definitions = ', '.join(f'(SELECT substr(md5(random()::text), 1, {colsize}))' for i in range(ncolumns))\n insert_data=f'insert into {table_name} select generate_series(1,{nrows}),{insert_definitions}, now();'\n db.execute_sqls(insert_data) \n\n #select without the index\n missing_index='select * from '+table_name+' where id='\n db.concurrent_execute_sql(threads,duration,missing_index,nrows)\n\n #delete the table\n delete_table(table_name)\n #print the end time\n print_time()\n\nif __name__ == \"__main__\":\n # Number of threads to use for concurrent inserts\n num_threads = 10\n \n # Duration for which to run the inserts (in seconds)\n insert_duration = None\n \n # Number of columns in the table\n num_columns = 5\n \n # Number of rows to insert\n num_rows = 2000000\n \n # Size of each column (in characters)\n column_size = 100\n \n # Table name\n table_name = 'table1'\n \n # Call the insert_large_data function\n missing_index(num_threads, insert_duration, num_columns, num_rows, column_size, table_name)\n", + "description": "In the database of a social media platform, if there are 10 users searching in the database table containing 5 columns, 2,000,000 rows, each column size of 100 characters, but the search lacks the necessary index, resulting in an exception in the database.\n", + "desc": "In a business intelligence scenario, particularly in a database used for financial analysis, there is a table called 'FinancialData' that stores detailed information about financial transactions. This table contains 2,000,000 rows of data, with 5 columns, each column able to accommodate up to 100 characters. These columns might include transaction ID, transaction type (such as income or expenditure), amount, date, and department. However, due to the absence of necessary indexes on commonly used columns, such as transaction type or department, the database struggles to efficiently handle multiple concurrent queries related to financial analysis. 
This inefficiency not only leads to a decline in query performance (e.g., slow response times), but it also affects the overall efficiency and accuracy of financial analysis in the business intelligence environment.\n" + }, + "516": { + "start_time": "1697448928", + "end_time": "1697448999", + "start_timestamp": "2023-10-16 17:35:28", + "end_timestamp": "2023-10-16 17:36:39", + "alerts": [], + "labels": [ + "missing indexes" + ], + "command": "python anomaly_trigger/main.py --anomaly MISSING_INDEXES", + "script": "import psycopg2\nimport sys\nsys.path.append('/root/DB-GPT/')\nimport time\nimport datetime\nimport random\nimport yaml\nfrom multiprocessing.pool import *\n\n\nclass DBArgs(object):\n\n def __init__(self, dbtype, config, dbname=None):\n self.dbtype = dbtype\n if self.dbtype == 'mysql':\n self.host = config['host']\n self.port = config['port']\n self.user = config['user']\n self.password = config['password']\n self.dbname = dbname if dbname else config['dbname']\n self.driver = 'com.mysql.jdbc.Driver'\n self.jdbc = 'jdbc:mysql://'\n else:\n self.host = config['host']\n self.port = config['port']\n self.user = config['user']\n self.password = config['password']\n self.dbname = dbname if dbname else config['dbname']\n self.driver = 'org.postgresql.Driver'\n self.jdbc = 'jdbc:postgresql://'\n\nclass Database():\n def __init__(self, args, timeout=-1):\n self.args = args\n self.conn = self.resetConn(timeout)\n\n\n # self.schema = self.compute_table_schema()\n\n def resetConn(self, timeout=-1):\n conn = psycopg2.connect(database=self.args.dbname,\n user=self.args.user,\n password=self.args.password,\n host=self.args.host,\n port=self.args.port)\n return conn\n \n def execute_sqls(self,sql):\n self.conn =self.resetConn(timeout=-1)\n cur = self.conn.cursor()\n cur.execute(sql)\n self.conn.commit()\n cur.close()\n self.conn.close()\n\n def execute_sql_duration(self, duration, sql, max_id=0, commit_interval=500):\n self.conn = self.resetConn(timeout=-1)\n cursor = self.conn.cursor()\n start = time.time()\n cnt = 0\n if duration > 0:\n while (time.time() - start) < duration:\n if max_id > 0:\n id = random.randint(1, max_id - 1)\n cursor.execute(sql + str(id) + ';')\n else:\n cursor.execute(sql)\n cnt += 1\n if cnt % commit_interval == 0:\n self.conn.commit()\n else:\n print(\"error, the duration should be larger than 0\")\n self.conn.commit()\n cursor.close()\n self.conn.close()\n return cnt\n\n def concurrent_execute_sql(self, threads, duration, sql, max_id=0, commit_interval=500):\n pool = ThreadPool(threads)\n results = [pool.apply_async(self.execute_sql_duration, (duration, sql, max_id, commit_interval)) for _ in range(threads)]\n pool.close()\n pool.join()\n return results\n\ndef init():\n #add the config\n config_path = \"/root/DB-GPT/config/tool_config.yaml\"\n with open(config_path, 'r') as config_file:\n config = yaml.safe_load(config_file) \n db_args =DBArgs('pgsql', config)\n return db_args\n\n\n#create a table\ndef create_table(table_name,colsize, ncolumns):\n db=Database(init())\n column_definitions = ', '.join(f'name{i} varchar({colsize})' for i in range(ncolumns))\n creat_sql = f'CREATE TABLE {table_name} (id int, {column_definitions}, time timestamp);'\n db.execute_sqls(creat_sql)\n\n#delete the table\ndef delete_table(table_name):\n db=Database(init())\n delete_sql=f'DROP TABLE if exists {table_name}'\n db.execute_sqls(delete_sql)\n\n#print the current time\ndef print_time():\n current_time = datetime.datetime.now()\n formatted_time = current_time.strftime(\"%Y-%m-%d 
%H:%M:%S\")\n print(formatted_time)\n\n\ndef missing_index(threads,duration,ncolumns,nrows,colsize,table_name='table1'):\n #create a new table\n print_time()\n db=Database(init())\n delete_table(table_name)\n create_table(table_name,colsize, ncolumns)\n\n # insert some data to be selected \n insert_definitions = ', '.join(f'(SELECT substr(md5(random()::text), 1, {colsize}))' for i in range(ncolumns))\n insert_data=f'insert into {table_name} select generate_series(1,{nrows}),{insert_definitions}, now();'\n db.execute_sqls(insert_data) \n\n #select without the index\n missing_index='select * from '+table_name+' where id='\n db.concurrent_execute_sql(threads,duration,missing_index,nrows)\n\n #delete the table\n delete_table(table_name)\n #print the end time\n print_time()\n\nif __name__ == \"__main__\":\n # Number of threads to use for concurrent inserts\n num_threads = 5\n \n # Duration for which to run the inserts (in seconds)\n insert_duration = None\n \n # Number of columns in the table\n num_columns = 5\n \n # Number of rows to insert\n num_rows = 4000000\n \n # Size of each column (in characters)\n column_size = 100\n \n # Table name\n table_name = 'table1'\n \n # Call the insert_large_data function\n missing_index(num_threads, insert_duration, num_columns, num_rows, column_size, table_name)\n", + "description": "In a large-scale database of a financial institution, if 5 users search the database table containing 5 columns, 4 million rows, each column size of 100 characters, but the search lacks the necessary index, it may cause a performance issue due to the absence of the required optimization.\n", + "desc": "In the banking scenario, there is a database called 'BankingDB' that stores customer and transaction data for a bank. Within this database, there is a table called 'TransactionHistory' that records detailed information about various banking transactions. The table contains 4 million rows of data, with each row representing a transaction record. There are 5 columns in the table, each with a size of 100 characters. These columns include transaction ID, account number, transaction type, transaction amount, and transaction date. However, due to the lack of necessary indexes on commonly used columns in the 'TransactionHistory' table, such as account number and transaction type, the database becomes inefficient in handling a large number of concurrent transaction queries. This inefficiency eventually leads to anomalies in the database, as it cannot efficiently process these numerous concurrent queries. 
This not only results in a decline in user experience, such as delayed transaction processing or failures, but may also affect the operational efficiency and reputation of the bank.\n" + }, + "517": { + "start_time": "1697449059", + "end_time": "1697449131", + "start_timestamp": "2023-10-16 17:37:39", + "end_timestamp": "2023-10-16 17:38:51", + "alerts": [], + "labels": [ + "missing indexes" + ], + "command": "python anomaly_trigger/main.py --anomaly MISSING_INDEXES", + "script": "import psycopg2\nimport sys\nsys.path.append('/root/DB-GPT/')\nimport time\nimport datetime\nimport random\nimport yaml\nfrom multiprocessing.pool import *\n\n\nclass DBArgs(object):\n\n def __init__(self, dbtype, config, dbname=None):\n self.dbtype = dbtype\n if self.dbtype == 'mysql':\n self.host = config['host']\n self.port = config['port']\n self.user = config['user']\n self.password = config['password']\n self.dbname = dbname if dbname else config['dbname']\n self.driver = 'com.mysql.jdbc.Driver'\n self.jdbc = 'jdbc:mysql://'\n else:\n self.host = config['host']\n self.port = config['port']\n self.user = config['user']\n self.password = config['password']\n self.dbname = dbname if dbname else config['dbname']\n self.driver = 'org.postgresql.Driver'\n self.jdbc = 'jdbc:postgresql://'\n\nclass Database():\n def __init__(self, args, timeout=-1):\n self.args = args\n self.conn = self.resetConn(timeout)\n\n\n # self.schema = self.compute_table_schema()\n\n def resetConn(self, timeout=-1):\n conn = psycopg2.connect(database=self.args.dbname,\n user=self.args.user,\n password=self.args.password,\n host=self.args.host,\n port=self.args.port)\n return conn\n \n def execute_sqls(self,sql):\n self.conn =self.resetConn(timeout=-1)\n cur = self.conn.cursor()\n cur.execute(sql)\n self.conn.commit()\n cur.close()\n self.conn.close()\n\n def execute_sql_duration(self, duration, sql, max_id=0, commit_interval=500):\n self.conn = self.resetConn(timeout=-1)\n cursor = self.conn.cursor()\n start = time.time()\n cnt = 0\n if duration > 0:\n while (time.time() - start) < duration:\n if max_id > 0:\n id = random.randint(1, max_id - 1)\n cursor.execute(sql + str(id) + ';')\n else:\n cursor.execute(sql)\n cnt += 1\n if cnt % commit_interval == 0:\n self.conn.commit()\n else:\n print(\"error, the duration should be larger than 0\")\n self.conn.commit()\n cursor.close()\n self.conn.close()\n return cnt\n\n def concurrent_execute_sql(self, threads, duration, sql, max_id=0, commit_interval=500):\n pool = ThreadPool(threads)\n results = [pool.apply_async(self.execute_sql_duration, (duration, sql, max_id, commit_interval)) for _ in range(threads)]\n pool.close()\n pool.join()\n return results\n\ndef init():\n #add the config\n config_path = \"/root/DB-GPT/config/tool_config.yaml\"\n with open(config_path, 'r') as config_file:\n config = yaml.safe_load(config_file) \n db_args =DBArgs('pgsql', config)\n return db_args\n\n\n#create a table\ndef create_table(table_name,colsize, ncolumns):\n db=Database(init())\n column_definitions = ', '.join(f'name{i} varchar({colsize})' for i in range(ncolumns))\n creat_sql = f'CREATE TABLE {table_name} (id int, {column_definitions}, time timestamp);'\n db.execute_sqls(creat_sql)\n\n#delete the table\ndef delete_table(table_name):\n db=Database(init())\n delete_sql=f'DROP TABLE if exists {table_name}'\n db.execute_sqls(delete_sql)\n\n#print the current time\ndef print_time():\n current_time = datetime.datetime.now()\n formatted_time = current_time.strftime(\"%Y-%m-%d %H:%M:%S\")\n 
print(formatted_time)\n\n\ndef missing_index(threads,duration,ncolumns,nrows,colsize,table_name='table1'):\n #create a new table\n print_time()\n db=Database(init())\n delete_table(table_name)\n create_table(table_name,colsize, ncolumns)\n\n # insert some data to be selected \n insert_definitions = ', '.join(f'(SELECT substr(md5(random()::text), 1, {colsize}))' for i in range(ncolumns))\n insert_data=f'insert into {table_name} select generate_series(1,{nrows}),{insert_definitions}, now();'\n db.execute_sqls(insert_data) \n\n #select without the index\n missing_index='select * from '+table_name+' where id='\n db.concurrent_execute_sql(threads,duration,missing_index,nrows)\n\n #delete the table\n delete_table(table_name)\n #print the end time\n print_time()\n\nif __name__ == \"__main__\":\n # Number of threads to use for concurrent inserts\n num_threads = 5\n \n # Duration for which to run the inserts (in seconds)\n insert_duration = None\n \n # Number of columns in the table\n num_columns = 5\n \n # Number of rows to insert\n num_rows = 4000000\n \n # Size of each column (in characters)\n column_size = 100\n \n # Table name\n table_name = 'table1'\n \n # Call the insert_large_data function\n missing_index(num_threads, insert_duration, num_columns, num_rows, column_size, table_name)\n", + "description": "In a financial analytics system, when performing data analysis on a large dataset containing 4,000,000 rows and 5 columns, with each column having a size of 100 characters, if there are 5 concurrent users searching for specific data, but the necessary indexes on the database table are missing, it may result in a database exception.\n", + "desc": "In the database of a file sharing system, let's assume there is a database named 'FileShareDB', which is used by multiple users or teams to share files. This database stores not only the files themselves but also the metadata associated with the files, such as file names, sizes, upload dates, download counts, and user permissions. However, there is a performance issue related to missing indexes in the database. When multiple users simultaneously search for files by entering keywords or filtering criteria, such as file names or upload dates, the database struggles to efficiently process these search requests due to the absence of necessary indexes, specifically on commonly used search columns. This inefficiency can lead to delays or failures in the search results, negatively impacting the user experience. 
Additionally, without proper indexing, the database may become inefficient in handling a large number of concurrent search requests, affecting the operational efficiency and reputation of the entire file sharing system.\n" + }, + "518": { + "start_time": "1697449191", + "end_time": "1697449263", + "start_timestamp": "2023-10-16 17:39:51", + "end_timestamp": "2023-10-16 17:41:03", + "alerts": [], + "labels": [ + "missing indexes" + ], + "command": "python anomaly_trigger/main.py --anomaly MISSING_INDEXES", + "script": "import psycopg2\nimport sys\nsys.path.append('/root/DB-GPT/')\nimport time\nimport datetime\nimport random\nimport yaml\nfrom multiprocessing.pool import *\n\n\nclass DBArgs(object):\n\n def __init__(self, dbtype, config, dbname=None):\n self.dbtype = dbtype\n if self.dbtype == 'mysql':\n self.host = config['host']\n self.port = config['port']\n self.user = config['user']\n self.password = config['password']\n self.dbname = dbname if dbname else config['dbname']\n self.driver = 'com.mysql.jdbc.Driver'\n self.jdbc = 'jdbc:mysql://'\n else:\n self.host = config['host']\n self.port = config['port']\n self.user = config['user']\n self.password = config['password']\n self.dbname = dbname if dbname else config['dbname']\n self.driver = 'org.postgresql.Driver'\n self.jdbc = 'jdbc:postgresql://'\n\nclass Database():\n def __init__(self, args, timeout=-1):\n self.args = args\n self.conn = self.resetConn(timeout)\n\n\n # self.schema = self.compute_table_schema()\n\n def resetConn(self, timeout=-1):\n conn = psycopg2.connect(database=self.args.dbname,\n user=self.args.user,\n password=self.args.password,\n host=self.args.host,\n port=self.args.port)\n return conn\n \n def execute_sqls(self,sql):\n self.conn =self.resetConn(timeout=-1)\n cur = self.conn.cursor()\n cur.execute(sql)\n self.conn.commit()\n cur.close()\n self.conn.close()\n\n def execute_sql_duration(self, duration, sql, max_id=0, commit_interval=500):\n self.conn = self.resetConn(timeout=-1)\n cursor = self.conn.cursor()\n start = time.time()\n cnt = 0\n if duration > 0:\n while (time.time() - start) < duration:\n if max_id > 0:\n id = random.randint(1, max_id - 1)\n cursor.execute(sql + str(id) + ';')\n else:\n cursor.execute(sql)\n cnt += 1\n if cnt % commit_interval == 0:\n self.conn.commit()\n else:\n print(\"error, the duration should be larger than 0\")\n self.conn.commit()\n cursor.close()\n self.conn.close()\n return cnt\n\n def concurrent_execute_sql(self, threads, duration, sql, max_id=0, commit_interval=500):\n pool = ThreadPool(threads)\n results = [pool.apply_async(self.execute_sql_duration, (duration, sql, max_id, commit_interval)) for _ in range(threads)]\n pool.close()\n pool.join()\n return results\n\ndef init():\n #add the config\n config_path = \"/root/DB-GPT/config/tool_config.yaml\"\n with open(config_path, 'r') as config_file:\n config = yaml.safe_load(config_file) \n db_args =DBArgs('pgsql', config)\n return db_args\n\n\n#create a table\ndef create_table(table_name,colsize, ncolumns):\n db=Database(init())\n column_definitions = ', '.join(f'name{i} varchar({colsize})' for i in range(ncolumns))\n creat_sql = f'CREATE TABLE {table_name} (id int, {column_definitions}, time timestamp);'\n db.execute_sqls(creat_sql)\n\n#delete the table\ndef delete_table(table_name):\n db=Database(init())\n delete_sql=f'DROP TABLE if exists {table_name}'\n db.execute_sqls(delete_sql)\n\n#print the current time\ndef print_time():\n current_time = datetime.datetime.now()\n formatted_time = current_time.strftime(\"%Y-%m-%d 
%H:%M:%S\")\n print(formatted_time)\n\n\ndef missing_index(threads,duration,ncolumns,nrows,colsize,table_name='table1'):\n #create a new table\n print_time()\n db=Database(init())\n delete_table(table_name)\n create_table(table_name,colsize, ncolumns)\n\n # insert some data to be selected \n insert_definitions = ', '.join(f'(SELECT substr(md5(random()::text), 1, {colsize}))' for i in range(ncolumns))\n insert_data=f'insert into {table_name} select generate_series(1,{nrows}),{insert_definitions}, now();'\n db.execute_sqls(insert_data) \n\n #select without the index\n missing_index='select * from '+table_name+' where id='\n db.concurrent_execute_sql(threads,duration,missing_index,nrows)\n\n #delete the table\n delete_table(table_name)\n #print the end time\n print_time()\n\nif __name__ == \"__main__\":\n # Number of threads to use for concurrent inserts\n num_threads = 10\n \n # Duration for which to run the inserts (in seconds)\n insert_duration = None\n \n # Number of columns in the table\n num_columns = 5\n \n # Number of rows to insert\n num_rows = 4000000\n \n # Size of each column (in characters)\n column_size = 100\n \n # Table name\n table_name = 'table1'\n \n # Call the insert_large_data function\n missing_index(num_threads, insert_duration, num_columns, num_rows, column_size, table_name)\n", + "description": "In a large database with 10 users performing searches on a table containing 5 columns and 4 million rows, each column having a size of 100 characters, a database exception occurs due to the lack of necessary indexes for efficient searching.\n", + "desc": "In a file sharing system scenario, there is a database used for storing and managing shared files named 'TeamFileShareDB'. This database contains metadata about the files, such as uploader information, file size, creation and modification dates, version history, access permissions, and download counts. During regular usage, multiple users might be simultaneously uploading, downloading, or editing files. For example, a team working on a project might frequently upload new versions of files, while others download them for viewing or editing. The system might also handle large files, such as presentations, video conference recordings, or design drawings. When a large number of concurrent file operations occur, the database faces challenges related to missing indexes. These missing indexes can cause inefficiencies when searching for specific files or retrieving file metadata. Without proper indexing, the database might have to scan through a large number of records, resulting in slower search queries and reduced overall performance. 
This can lead to delays or failures in tasks such as searching for files, accessing file properties, or updating file permissions.\n" + }, + "519": { + "start_time": "1697449323", + "end_time": "1697449395", + "start_timestamp": "2023-10-16 17:42:03", + "end_timestamp": "2023-10-16 17:43:15", + "alerts": [], + "labels": [ + "missing indexes" + ], + "command": "python anomaly_trigger/main.py --anomaly MISSING_INDEXES", + "script": "import psycopg2\nimport sys\nsys.path.append('/root/DB-GPT/')\nimport time\nimport datetime\nimport random\nimport yaml\nfrom multiprocessing.pool import *\n\n\nclass DBArgs(object):\n\n def __init__(self, dbtype, config, dbname=None):\n self.dbtype = dbtype\n if self.dbtype == 'mysql':\n self.host = config['host']\n self.port = config['port']\n self.user = config['user']\n self.password = config['password']\n self.dbname = dbname if dbname else config['dbname']\n self.driver = 'com.mysql.jdbc.Driver'\n self.jdbc = 'jdbc:mysql://'\n else:\n self.host = config['host']\n self.port = config['port']\n self.user = config['user']\n self.password = config['password']\n self.dbname = dbname if dbname else config['dbname']\n self.driver = 'org.postgresql.Driver'\n self.jdbc = 'jdbc:postgresql://'\n\nclass Database():\n def __init__(self, args, timeout=-1):\n self.args = args\n self.conn = self.resetConn(timeout)\n\n\n # self.schema = self.compute_table_schema()\n\n def resetConn(self, timeout=-1):\n conn = psycopg2.connect(database=self.args.dbname,\n user=self.args.user,\n password=self.args.password,\n host=self.args.host,\n port=self.args.port)\n return conn\n \n def execute_sqls(self,sql):\n self.conn =self.resetConn(timeout=-1)\n cur = self.conn.cursor()\n cur.execute(sql)\n self.conn.commit()\n cur.close()\n self.conn.close()\n\n def execute_sql_duration(self, duration, sql, max_id=0, commit_interval=500):\n self.conn = self.resetConn(timeout=-1)\n cursor = self.conn.cursor()\n start = time.time()\n cnt = 0\n if duration > 0:\n while (time.time() - start) < duration:\n if max_id > 0:\n id = random.randint(1, max_id - 1)\n cursor.execute(sql + str(id) + ';')\n else:\n cursor.execute(sql)\n cnt += 1\n if cnt % commit_interval == 0:\n self.conn.commit()\n else:\n print(\"error, the duration should be larger than 0\")\n self.conn.commit()\n cursor.close()\n self.conn.close()\n return cnt\n\n def concurrent_execute_sql(self, threads, duration, sql, max_id=0, commit_interval=500):\n pool = ThreadPool(threads)\n results = [pool.apply_async(self.execute_sql_duration, (duration, sql, max_id, commit_interval)) for _ in range(threads)]\n pool.close()\n pool.join()\n return results\n\ndef init():\n #add the config\n config_path = \"/root/DB-GPT/config/tool_config.yaml\"\n with open(config_path, 'r') as config_file:\n config = yaml.safe_load(config_file) \n db_args =DBArgs('pgsql', config)\n return db_args\n\n\n#create a table\ndef create_table(table_name,colsize, ncolumns):\n db=Database(init())\n column_definitions = ', '.join(f'name{i} varchar({colsize})' for i in range(ncolumns))\n creat_sql = f'CREATE TABLE {table_name} (id int, {column_definitions}, time timestamp);'\n db.execute_sqls(creat_sql)\n\n#delete the table\ndef delete_table(table_name):\n db=Database(init())\n delete_sql=f'DROP TABLE if exists {table_name}'\n db.execute_sqls(delete_sql)\n\n#print the current time\ndef print_time():\n current_time = datetime.datetime.now()\n formatted_time = current_time.strftime(\"%Y-%m-%d %H:%M:%S\")\n print(formatted_time)\n\n\ndef 
missing_index(threads,duration,ncolumns,nrows,colsize,table_name='table1'):\n #create a new table\n print_time()\n db=Database(init())\n delete_table(table_name)\n create_table(table_name,colsize, ncolumns)\n\n # insert some data to be selected \n insert_definitions = ', '.join(f'(SELECT substr(md5(random()::text), 1, {colsize}))' for i in range(ncolumns))\n insert_data=f'insert into {table_name} select generate_series(1,{nrows}),{insert_definitions}, now();'\n db.execute_sqls(insert_data) \n\n #select without the index\n missing_index='select * from '+table_name+' where id='\n db.concurrent_execute_sql(threads,duration,missing_index,nrows)\n\n #delete the table\n delete_table(table_name)\n #print the end time\n print_time()\n\nif __name__ == \"__main__\":\n # Number of threads to use for concurrent inserts\n num_threads = 10\n \n # Duration for which to run the inserts (in seconds)\n insert_duration = None\n \n # Number of columns in the table\n num_columns = 5\n \n # Number of rows to insert\n num_rows = 4000000\n \n # Size of each column (in characters)\n column_size = 100\n \n # Table name\n table_name = 'table1'\n \n # Call the insert_large_data function\n missing_index(num_threads, insert_duration, num_columns, num_rows, column_size, table_name)\n", + "description": "In a database system for an online marketplace, when 10 users search for products in a database table with 5 columns, containing 4,000,000 rows, and each column has a size of 100 characters, an exception occurs due to the lack of necessary indexes on the search columns.\n", + "desc": "In a business intelligence scenario, a database named 'SalesAnalysisDB' is used to store and analyze sales data of a company. This database contains a key table named 'SalesRecords', which records detailed information about various sales transactions. The table consists of 4,000,000 rows of data, each representing a sales record, with a total of 5 columns, each containing information of up to 100 characters. These columns may include transaction ID, product ID, sales date, sales amount, and customer ID. In this scenario, due to the lack of necessary indexes on commonly used columns like product ID and customer ID, the database becomes inefficient in handling queries related to these columns. When multiple users simultaneously execute queries that involve searching for specific products or analyzing sales performance by customer, the database struggles to quickly retrieve the relevant data. 
This inefficiency not only affects the user experience but also hampers the ability to make timely business decisions based on accurate sales data.\n" + }, + "520": { + "start_time": "1697449469", + "end_time": "1697449540", + "start_timestamp": "2023-10-16 17:44:29", + "end_timestamp": "2023-10-16 17:45:40", + "alerts": [], + "labels": [ + "missing indexes" + ], + "command": "python anomaly_trigger/main.py --anomaly MISSING_INDEXES", + "script": "import psycopg2\nimport sys\nsys.path.append('/root/DB-GPT/')\nimport time\nimport datetime\nimport random\nimport yaml\nfrom multiprocessing.pool import *\n\n\nclass DBArgs(object):\n\n def __init__(self, dbtype, config, dbname=None):\n self.dbtype = dbtype\n if self.dbtype == 'mysql':\n self.host = config['host']\n self.port = config['port']\n self.user = config['user']\n self.password = config['password']\n self.dbname = dbname if dbname else config['dbname']\n self.driver = 'com.mysql.jdbc.Driver'\n self.jdbc = 'jdbc:mysql://'\n else:\n self.host = config['host']\n self.port = config['port']\n self.user = config['user']\n self.password = config['password']\n self.dbname = dbname if dbname else config['dbname']\n self.driver = 'org.postgresql.Driver'\n self.jdbc = 'jdbc:postgresql://'\n\nclass Database():\n def __init__(self, args, timeout=-1):\n self.args = args\n self.conn = self.resetConn(timeout)\n\n\n # self.schema = self.compute_table_schema()\n\n def resetConn(self, timeout=-1):\n conn = psycopg2.connect(database=self.args.dbname,\n user=self.args.user,\n password=self.args.password,\n host=self.args.host,\n port=self.args.port)\n return conn\n \n def execute_sqls(self,sql):\n self.conn =self.resetConn(timeout=-1)\n cur = self.conn.cursor()\n cur.execute(sql)\n self.conn.commit()\n cur.close()\n self.conn.close()\n\n def execute_sql_duration(self, duration, sql, max_id=0, commit_interval=500):\n self.conn = self.resetConn(timeout=-1)\n cursor = self.conn.cursor()\n start = time.time()\n cnt = 0\n if duration > 0:\n while (time.time() - start) < duration:\n if max_id > 0:\n id = random.randint(1, max_id - 1)\n cursor.execute(sql + str(id) + ';')\n else:\n cursor.execute(sql)\n cnt += 1\n if cnt % commit_interval == 0:\n self.conn.commit()\n else:\n print(\"error, the duration should be larger than 0\")\n self.conn.commit()\n cursor.close()\n self.conn.close()\n return cnt\n\n def concurrent_execute_sql(self, threads, duration, sql, max_id=0, commit_interval=500):\n pool = ThreadPool(threads)\n results = [pool.apply_async(self.execute_sql_duration, (duration, sql, max_id, commit_interval)) for _ in range(threads)]\n pool.close()\n pool.join()\n return results\n\ndef init():\n #add the config\n config_path = \"/root/DB-GPT/config/tool_config.yaml\"\n with open(config_path, 'r') as config_file:\n config = yaml.safe_load(config_file) \n db_args =DBArgs('pgsql', config)\n return db_args\n\n\n#create a table\ndef create_table(table_name,colsize, ncolumns):\n db=Database(init())\n column_definitions = ', '.join(f'name{i} varchar({colsize})' for i in range(ncolumns))\n creat_sql = f'CREATE TABLE {table_name} (id int, {column_definitions}, time timestamp);'\n db.execute_sqls(creat_sql)\n\n#delete the table\ndef delete_table(table_name):\n db=Database(init())\n delete_sql=f'DROP TABLE if exists {table_name}'\n db.execute_sqls(delete_sql)\n\n#print the current time\ndef print_time():\n current_time = datetime.datetime.now()\n formatted_time = current_time.strftime(\"%Y-%m-%d %H:%M:%S\")\n print(formatted_time)\n\n\ndef 
missing_index(threads,duration,ncolumns,nrows,colsize,table_name='table1'):\n #create a new table\n print_time()\n db=Database(init())\n delete_table(table_name)\n create_table(table_name,colsize, ncolumns)\n\n # insert some data to be selected \n insert_definitions = ', '.join(f'(SELECT substr(md5(random()::text), 1, {colsize}))' for i in range(ncolumns))\n insert_data=f'insert into {table_name} select generate_series(1,{nrows}),{insert_definitions}, now();'\n db.execute_sqls(insert_data) \n\n #select without the index\n missing_index='select * from '+table_name+' where id='\n db.concurrent_execute_sql(threads,duration,missing_index,nrows)\n\n #delete the table\n delete_table(table_name)\n #print the end time\n print_time()\n\nif __name__ == \"__main__\":\n # Number of threads to use for concurrent inserts\n num_threads = 5\n \n # Duration for which to run the inserts (in seconds)\n insert_duration = None\n \n # Number of columns in the table\n num_columns = 10\n \n # Number of rows to insert\n num_rows = 2000000\n \n # Size of each column (in characters)\n column_size = 50\n \n # Table name\n table_name = 'table1'\n \n # Call the insert_large_data function\n missing_index(num_threads, insert_duration, num_columns, num_rows, column_size, table_name)\n", + "description": "In a database for a social media platform, if there are 5 users searching in a database table containing 10 columns, 2,000,000 rows, with each column size of 50 characters, but the search lacks the necessary index, it could result in an exception in the database.\n", + "desc": "In a banking scenario, there is a database named 'BankingDB' used to store and handle customer and transaction data. Within this database, there is a table called 'TransactionRecords' that stores detailed information about various banking transactions. This table consists of 2,000,000 rows of data, each representing a transaction record, with a total of 10 columns, each containing information of up to 50 characters. These columns may include transaction ID, account number, transaction type (such as deposit, withdrawal, transfer), transaction amount, transaction date and time, counterparty account information, transaction status, employee ID (of the bank staff who processed the transaction), transaction location, currency type, and others. However, due to the absence of important indexes on commonly used search columns such as account number and transaction date, the database fails to efficiently handle concurrent search requests. 
This inefficiency in processing concurrent search queries not only affects user experience by causing delays or failures but also hampers the operational efficiency and reputation of the banking system as a whole.\n" + }, + "521": { + "start_time": "1697449600", + "end_time": "1697449671", + "start_timestamp": "2023-10-16 17:46:40", + "end_timestamp": "2023-10-16 17:47:51", + "alerts": [], + "labels": [ + "missing indexes" + ], + "command": "python anomaly_trigger/main.py --anomaly MISSING_INDEXES", + "script": "import psycopg2\nimport sys\nsys.path.append('/root/DB-GPT/')\nimport time\nimport datetime\nimport random\nimport yaml\nfrom multiprocessing.pool import *\n\n\nclass DBArgs(object):\n\n def __init__(self, dbtype, config, dbname=None):\n self.dbtype = dbtype\n if self.dbtype == 'mysql':\n self.host = config['host']\n self.port = config['port']\n self.user = config['user']\n self.password = config['password']\n self.dbname = dbname if dbname else config['dbname']\n self.driver = 'com.mysql.jdbc.Driver'\n self.jdbc = 'jdbc:mysql://'\n else:\n self.host = config['host']\n self.port = config['port']\n self.user = config['user']\n self.password = config['password']\n self.dbname = dbname if dbname else config['dbname']\n self.driver = 'org.postgresql.Driver'\n self.jdbc = 'jdbc:postgresql://'\n\nclass Database():\n def __init__(self, args, timeout=-1):\n self.args = args\n self.conn = self.resetConn(timeout)\n\n\n # self.schema = self.compute_table_schema()\n\n def resetConn(self, timeout=-1):\n conn = psycopg2.connect(database=self.args.dbname,\n user=self.args.user,\n password=self.args.password,\n host=self.args.host,\n port=self.args.port)\n return conn\n \n def execute_sqls(self,sql):\n self.conn =self.resetConn(timeout=-1)\n cur = self.conn.cursor()\n cur.execute(sql)\n self.conn.commit()\n cur.close()\n self.conn.close()\n\n def execute_sql_duration(self, duration, sql, max_id=0, commit_interval=500):\n self.conn = self.resetConn(timeout=-1)\n cursor = self.conn.cursor()\n start = time.time()\n cnt = 0\n if duration > 0:\n while (time.time() - start) < duration:\n if max_id > 0:\n id = random.randint(1, max_id - 1)\n cursor.execute(sql + str(id) + ';')\n else:\n cursor.execute(sql)\n cnt += 1\n if cnt % commit_interval == 0:\n self.conn.commit()\n else:\n print(\"error, the duration should be larger than 0\")\n self.conn.commit()\n cursor.close()\n self.conn.close()\n return cnt\n\n def concurrent_execute_sql(self, threads, duration, sql, max_id=0, commit_interval=500):\n pool = ThreadPool(threads)\n results = [pool.apply_async(self.execute_sql_duration, (duration, sql, max_id, commit_interval)) for _ in range(threads)]\n pool.close()\n pool.join()\n return results\n\ndef init():\n #add the config\n config_path = \"/root/DB-GPT/config/tool_config.yaml\"\n with open(config_path, 'r') as config_file:\n config = yaml.safe_load(config_file) \n db_args =DBArgs('pgsql', config)\n return db_args\n\n\n#create a table\ndef create_table(table_name,colsize, ncolumns):\n db=Database(init())\n column_definitions = ', '.join(f'name{i} varchar({colsize})' for i in range(ncolumns))\n creat_sql = f'CREATE TABLE {table_name} (id int, {column_definitions}, time timestamp);'\n db.execute_sqls(creat_sql)\n\n#delete the table\ndef delete_table(table_name):\n db=Database(init())\n delete_sql=f'DROP TABLE if exists {table_name}'\n db.execute_sqls(delete_sql)\n\n#print the current time\ndef print_time():\n current_time = datetime.datetime.now()\n formatted_time = current_time.strftime(\"%Y-%m-%d 
%H:%M:%S\")\n print(formatted_time)\n\n\ndef missing_index(threads,duration,ncolumns,nrows,colsize,table_name='table1'):\n #create a new table\n print_time()\n db=Database(init())\n delete_table(table_name)\n create_table(table_name,colsize, ncolumns)\n\n # insert some data to be selected \n insert_definitions = ', '.join(f'(SELECT substr(md5(random()::text), 1, {colsize}))' for i in range(ncolumns))\n insert_data=f'insert into {table_name} select generate_series(1,{nrows}),{insert_definitions}, now();'\n db.execute_sqls(insert_data) \n\n #select without the index\n missing_index='select * from '+table_name+' where id='\n db.concurrent_execute_sql(threads,duration,missing_index,nrows)\n\n #delete the table\n delete_table(table_name)\n #print the end time\n print_time()\n\nif __name__ == \"__main__\":\n # Number of threads to use for concurrent inserts\n num_threads = 5\n \n # Duration for which to run the inserts (in seconds)\n insert_duration = None\n \n # Number of columns in the table\n num_columns = 10\n \n # Number of rows to insert\n num_rows = 2000000\n \n # Size of each column (in characters)\n column_size = 50\n \n # Table name\n table_name = 'table1'\n \n # Call the insert_large_data function\n missing_index(num_threads, insert_duration, num_columns, num_rows, column_size, table_name)\n", + "description": "In a company's database for managing employee records, if 5 users simultaneously search for information in a database table containing 10 columns and 2,000,000 rows, each column's size being 50 characters, but the necessary index is missing, it may lead to performance issues and a potential exception in the database.\n", + "desc": "In a database used for an online banking system, there is a table named 'AccountTransactions' that stores information about various banking transactions. This table contains 2,000,000 rows of data, each representing a transaction record. Within this table, there are 10 columns, each with a size of 50 characters, including transaction ID, account number, transaction type, transaction amount, date and time, counterparty account information, transaction status, employee ID, transaction location, and currency type. Due to the large volume of transactions and the lack of necessary indexes on commonly searched columns, such as account number and transaction date, the database is unable to efficiently process concurrent search requests. 
This inefficiency can result in delays or failures in retrieving transaction information, which can negatively impact the banking system's operations and user experience.\n" + }, + "522": { + "start_time": "1697449731", + "end_time": "1697449803", + "start_timestamp": "2023-10-16 17:48:51", + "end_timestamp": "2023-10-16 17:50:03", + "alerts": [], + "labels": [ + "missing indexes" + ], + "command": "python anomaly_trigger/main.py --anomaly MISSING_INDEXES", + "script": "import psycopg2\nimport sys\nsys.path.append('/root/DB-GPT/')\nimport time\nimport datetime\nimport random\nimport yaml\nfrom multiprocessing.pool import *\n\n\nclass DBArgs(object):\n\n def __init__(self, dbtype, config, dbname=None):\n self.dbtype = dbtype\n if self.dbtype == 'mysql':\n self.host = config['host']\n self.port = config['port']\n self.user = config['user']\n self.password = config['password']\n self.dbname = dbname if dbname else config['dbname']\n self.driver = 'com.mysql.jdbc.Driver'\n self.jdbc = 'jdbc:mysql://'\n else:\n self.host = config['host']\n self.port = config['port']\n self.user = config['user']\n self.password = config['password']\n self.dbname = dbname if dbname else config['dbname']\n self.driver = 'org.postgresql.Driver'\n self.jdbc = 'jdbc:postgresql://'\n\nclass Database():\n def __init__(self, args, timeout=-1):\n self.args = args\n self.conn = self.resetConn(timeout)\n\n\n # self.schema = self.compute_table_schema()\n\n def resetConn(self, timeout=-1):\n conn = psycopg2.connect(database=self.args.dbname,\n user=self.args.user,\n password=self.args.password,\n host=self.args.host,\n port=self.args.port)\n return conn\n \n def execute_sqls(self,sql):\n self.conn =self.resetConn(timeout=-1)\n cur = self.conn.cursor()\n cur.execute(sql)\n self.conn.commit()\n cur.close()\n self.conn.close()\n\n def execute_sql_duration(self, duration, sql, max_id=0, commit_interval=500):\n self.conn = self.resetConn(timeout=-1)\n cursor = self.conn.cursor()\n start = time.time()\n cnt = 0\n if duration > 0:\n while (time.time() - start) < duration:\n if max_id > 0:\n id = random.randint(1, max_id - 1)\n cursor.execute(sql + str(id) + ';')\n else:\n cursor.execute(sql)\n cnt += 1\n if cnt % commit_interval == 0:\n self.conn.commit()\n else:\n print(\"error, the duration should be larger than 0\")\n self.conn.commit()\n cursor.close()\n self.conn.close()\n return cnt\n\n def concurrent_execute_sql(self, threads, duration, sql, max_id=0, commit_interval=500):\n pool = ThreadPool(threads)\n results = [pool.apply_async(self.execute_sql_duration, (duration, sql, max_id, commit_interval)) for _ in range(threads)]\n pool.close()\n pool.join()\n return results\n\ndef init():\n #add the config\n config_path = \"/root/DB-GPT/config/tool_config.yaml\"\n with open(config_path, 'r') as config_file:\n config = yaml.safe_load(config_file) \n db_args =DBArgs('pgsql', config)\n return db_args\n\n\n#create a table\ndef create_table(table_name,colsize, ncolumns):\n db=Database(init())\n column_definitions = ', '.join(f'name{i} varchar({colsize})' for i in range(ncolumns))\n creat_sql = f'CREATE TABLE {table_name} (id int, {column_definitions}, time timestamp);'\n db.execute_sqls(creat_sql)\n\n#delete the table\ndef delete_table(table_name):\n db=Database(init())\n delete_sql=f'DROP TABLE if exists {table_name}'\n db.execute_sqls(delete_sql)\n\n#print the current time\ndef print_time():\n current_time = datetime.datetime.now()\n formatted_time = current_time.strftime(\"%Y-%m-%d %H:%M:%S\")\n print(formatted_time)\n\n\ndef 
missing_index(threads,duration,ncolumns,nrows,colsize,table_name='table1'):\n #create a new table\n print_time()\n db=Database(init())\n delete_table(table_name)\n create_table(table_name,colsize, ncolumns)\n\n # insert some data to be selected \n insert_definitions = ', '.join(f'(SELECT substr(md5(random()::text), 1, {colsize}))' for i in range(ncolumns))\n insert_data=f'insert into {table_name} select generate_series(1,{nrows}),{insert_definitions}, now();'\n db.execute_sqls(insert_data) \n\n #select without the index\n missing_index='select * from '+table_name+' where id='\n db.concurrent_execute_sql(threads,duration,missing_index,nrows)\n\n #delete the table\n delete_table(table_name)\n #print the end time\n print_time()\n\nif __name__ == \"__main__\":\n # Number of threads to use for concurrent inserts\n num_threads = 10\n \n # Duration for which to run the inserts (in seconds)\n insert_duration = None\n \n # Number of columns in the table\n num_columns = 10\n \n # Number of rows to insert\n num_rows = 2000000\n \n # Size of each column (in characters)\n column_size = 50\n \n # Table name\n table_name = 'table1'\n \n # Call the insert_large_data function\n missing_index(num_threads, insert_duration, num_columns, num_rows, column_size, table_name)\n", + "description": "In a social media platform, if there are 10 users searching in a database table with 10 columns and 2,000,000 rows, each column having a size of 50 characters, but the search lacks the necessary index, it may lead to a database exception.\n", + "desc": "In a database used for an online banking platform named 'BankingDB', there is a table called 'TransactionDetails', which stores information about various banking transactions. This table consists of 2,000,000 rows, each representing a transaction record, and has 10 columns, each with a size of 50 characters. These columns may include transaction ID, account number, transaction type (such as deposit, withdrawal, or transfer), transaction amount, transaction date and time, counterparty account information, transaction status, employee ID (of the bank staff who processed the transaction), transaction location, and currency type. Due to the lack of necessary indexes in the 'TransactionDetails' table, particularly on commonly used search columns, the database becomes inefficient in handling a large number of concurrent search requests. This inefficiency eventually leads to anomalies in the database, as it cannot efficiently process these numerous concurrent search requests. 
This not only results in a decline in user experience (e.g., search delays or failures) but may also affect the operational efficiency and reputation of the entire online banking platform.\n" + }, + "523": { + "start_time": "1697449863", + "end_time": "1697449934", + "start_timestamp": "2023-10-16 17:51:03", + "end_timestamp": "2023-10-16 17:52:14", + "alerts": [], + "labels": [ + "missing indexes" + ], + "command": "python anomaly_trigger/main.py --anomaly MISSING_INDEXES", + "script": "import psycopg2\nimport sys\nsys.path.append('/root/DB-GPT/')\nimport time\nimport datetime\nimport random\nimport yaml\nfrom multiprocessing.pool import *\n\n\nclass DBArgs(object):\n\n def __init__(self, dbtype, config, dbname=None):\n self.dbtype = dbtype\n if self.dbtype == 'mysql':\n self.host = config['host']\n self.port = config['port']\n self.user = config['user']\n self.password = config['password']\n self.dbname = dbname if dbname else config['dbname']\n self.driver = 'com.mysql.jdbc.Driver'\n self.jdbc = 'jdbc:mysql://'\n else:\n self.host = config['host']\n self.port = config['port']\n self.user = config['user']\n self.password = config['password']\n self.dbname = dbname if dbname else config['dbname']\n self.driver = 'org.postgresql.Driver'\n self.jdbc = 'jdbc:postgresql://'\n\nclass Database():\n def __init__(self, args, timeout=-1):\n self.args = args\n self.conn = self.resetConn(timeout)\n\n\n # self.schema = self.compute_table_schema()\n\n def resetConn(self, timeout=-1):\n conn = psycopg2.connect(database=self.args.dbname,\n user=self.args.user,\n password=self.args.password,\n host=self.args.host,\n port=self.args.port)\n return conn\n \n def execute_sqls(self,sql):\n self.conn =self.resetConn(timeout=-1)\n cur = self.conn.cursor()\n cur.execute(sql)\n self.conn.commit()\n cur.close()\n self.conn.close()\n\n def execute_sql_duration(self, duration, sql, max_id=0, commit_interval=500):\n self.conn = self.resetConn(timeout=-1)\n cursor = self.conn.cursor()\n start = time.time()\n cnt = 0\n if duration > 0:\n while (time.time() - start) < duration:\n if max_id > 0:\n id = random.randint(1, max_id - 1)\n cursor.execute(sql + str(id) + ';')\n else:\n cursor.execute(sql)\n cnt += 1\n if cnt % commit_interval == 0:\n self.conn.commit()\n else:\n print(\"error, the duration should be larger than 0\")\n self.conn.commit()\n cursor.close()\n self.conn.close()\n return cnt\n\n def concurrent_execute_sql(self, threads, duration, sql, max_id=0, commit_interval=500):\n pool = ThreadPool(threads)\n results = [pool.apply_async(self.execute_sql_duration, (duration, sql, max_id, commit_interval)) for _ in range(threads)]\n pool.close()\n pool.join()\n return results\n\ndef init():\n #add the config\n config_path = \"/root/DB-GPT/config/tool_config.yaml\"\n with open(config_path, 'r') as config_file:\n config = yaml.safe_load(config_file) \n db_args =DBArgs('pgsql', config)\n return db_args\n\n\n#create a table\ndef create_table(table_name,colsize, ncolumns):\n db=Database(init())\n column_definitions = ', '.join(f'name{i} varchar({colsize})' for i in range(ncolumns))\n creat_sql = f'CREATE TABLE {table_name} (id int, {column_definitions}, time timestamp);'\n db.execute_sqls(creat_sql)\n\n#delete the table\ndef delete_table(table_name):\n db=Database(init())\n delete_sql=f'DROP TABLE if exists {table_name}'\n db.execute_sqls(delete_sql)\n\n#print the current time\ndef print_time():\n current_time = datetime.datetime.now()\n formatted_time = current_time.strftime(\"%Y-%m-%d %H:%M:%S\")\n 
print(formatted_time)\n\n\ndef missing_index(threads,duration,ncolumns,nrows,colsize,table_name='table1'):\n #create a new table\n print_time()\n db=Database(init())\n delete_table(table_name)\n create_table(table_name,colsize, ncolumns)\n\n # insert some data to be selected \n insert_definitions = ', '.join(f'(SELECT substr(md5(random()::text), 1, {colsize}))' for i in range(ncolumns))\n insert_data=f'insert into {table_name} select generate_series(1,{nrows}),{insert_definitions}, now();'\n db.execute_sqls(insert_data) \n\n #select without the index\n missing_index='select * from '+table_name+' where id='\n db.concurrent_execute_sql(threads,duration,missing_index,nrows)\n\n #delete the table\n delete_table(table_name)\n #print the end time\n print_time()\n\nif __name__ == \"__main__\":\n # Number of threads to use for concurrent inserts\n num_threads = 10\n \n # Duration for which to run the inserts (in seconds)\n insert_duration = None\n \n # Number of columns in the table\n num_columns = 10\n \n # Number of rows to insert\n num_rows = 2000000\n \n # Size of each column (in characters)\n column_size = 50\n \n # Table name\n table_name = 'table1'\n \n # Call the insert_large_data function\n missing_index(num_threads, insert_duration, num_columns, num_rows, column_size, table_name)\n", + "description": "In a database scenario, there are 10 users searching for data in a table with 10 columns and 2 million rows. Each column has a size of 50 characters. However, this search operation is missing the necessary indexes, which can result in a database exception.\n", + "desc": "In a business intelligence scenario, imagine a database named 'BusinessIntelDB' where a large corporation stores and analyzes various business data. This database consists of several tables, including a key table named 'SalesRecords', which contains detailed information about sales transactions. Suppose this table contains 2,000,000 rows of data, with each row representing a sales transaction and a total of 10 columns, each with a size of 50 characters. These columns may include transaction ID, sales date, customer ID, product ID, quantity, price, payment method, and sales status.However, due to the lack of necessary indexes on commonly used columns such as customer ID, product ID, and sales date in the 'SalesRecords' table, the database struggles to efficiently handle a large number of concurrent query requests. This inefficiency leads to anomalies, such as slow query performance, search delays, and potential errors in retrieving sales data. 
These issues not only negatively impact the efficiency of business analysis but also hinder timely decision-making processes.\n" + }, + "524": { + "start_time": "1697450021", + "end_time": "1697450092", + "start_timestamp": "2023-10-16 17:53:41", + "end_timestamp": "2023-10-16 17:54:52", + "alerts": [], + "labels": [ + "missing indexes" + ], + "command": "python anomaly_trigger/main.py --anomaly MISSING_INDEXES", + "script": "import psycopg2\nimport sys\nsys.path.append('/root/DB-GPT/')\nimport time\nimport datetime\nimport random\nimport yaml\nfrom multiprocessing.pool import *\n\n\nclass DBArgs(object):\n\n def __init__(self, dbtype, config, dbname=None):\n self.dbtype = dbtype\n if self.dbtype == 'mysql':\n self.host = config['host']\n self.port = config['port']\n self.user = config['user']\n self.password = config['password']\n self.dbname = dbname if dbname else config['dbname']\n self.driver = 'com.mysql.jdbc.Driver'\n self.jdbc = 'jdbc:mysql://'\n else:\n self.host = config['host']\n self.port = config['port']\n self.user = config['user']\n self.password = config['password']\n self.dbname = dbname if dbname else config['dbname']\n self.driver = 'org.postgresql.Driver'\n self.jdbc = 'jdbc:postgresql://'\n\nclass Database():\n def __init__(self, args, timeout=-1):\n self.args = args\n self.conn = self.resetConn(timeout)\n\n\n # self.schema = self.compute_table_schema()\n\n def resetConn(self, timeout=-1):\n conn = psycopg2.connect(database=self.args.dbname,\n user=self.args.user,\n password=self.args.password,\n host=self.args.host,\n port=self.args.port)\n return conn\n \n def execute_sqls(self,sql):\n self.conn =self.resetConn(timeout=-1)\n cur = self.conn.cursor()\n cur.execute(sql)\n self.conn.commit()\n cur.close()\n self.conn.close()\n\n def execute_sql_duration(self, duration, sql, max_id=0, commit_interval=500):\n self.conn = self.resetConn(timeout=-1)\n cursor = self.conn.cursor()\n start = time.time()\n cnt = 0\n if duration > 0:\n while (time.time() - start) < duration:\n if max_id > 0:\n id = random.randint(1, max_id - 1)\n cursor.execute(sql + str(id) + ';')\n else:\n cursor.execute(sql)\n cnt += 1\n if cnt % commit_interval == 0:\n self.conn.commit()\n else:\n print(\"error, the duration should be larger than 0\")\n self.conn.commit()\n cursor.close()\n self.conn.close()\n return cnt\n\n def concurrent_execute_sql(self, threads, duration, sql, max_id=0, commit_interval=500):\n pool = ThreadPool(threads)\n results = [pool.apply_async(self.execute_sql_duration, (duration, sql, max_id, commit_interval)) for _ in range(threads)]\n pool.close()\n pool.join()\n return results\n\ndef init():\n #add the config\n config_path = \"/root/DB-GPT/config/tool_config.yaml\"\n with open(config_path, 'r') as config_file:\n config = yaml.safe_load(config_file) \n db_args =DBArgs('pgsql', config)\n return db_args\n\n\n#create a table\ndef create_table(table_name,colsize, ncolumns):\n db=Database(init())\n column_definitions = ', '.join(f'name{i} varchar({colsize})' for i in range(ncolumns))\n creat_sql = f'CREATE TABLE {table_name} (id int, {column_definitions}, time timestamp);'\n db.execute_sqls(creat_sql)\n\n#delete the table\ndef delete_table(table_name):\n db=Database(init())\n delete_sql=f'DROP TABLE if exists {table_name}'\n db.execute_sqls(delete_sql)\n\n#print the current time\ndef print_time():\n current_time = datetime.datetime.now()\n formatted_time = current_time.strftime(\"%Y-%m-%d %H:%M:%S\")\n print(formatted_time)\n\n\ndef 
missing_index(threads,duration,ncolumns,nrows,colsize,table_name='table1'):\n #create a new table\n print_time()\n db=Database(init())\n delete_table(table_name)\n create_table(table_name,colsize, ncolumns)\n\n # insert some data to be selected \n insert_definitions = ', '.join(f'(SELECT substr(md5(random()::text), 1, {colsize}))' for i in range(ncolumns))\n insert_data=f'insert into {table_name} select generate_series(1,{nrows}),{insert_definitions}, now();'\n db.execute_sqls(insert_data) \n\n #select without the index\n missing_index='select * from '+table_name+' where id='\n db.concurrent_execute_sql(threads,duration,missing_index,nrows)\n\n #delete the table\n delete_table(table_name)\n #print the end time\n print_time()\n\nif __name__ == \"__main__\":\n # Number of threads to use for concurrent inserts\n num_threads = 5\n \n # Duration for which to run the inserts (in seconds)\n insert_duration = None\n \n # Number of columns in the table\n num_columns = 10\n \n # Number of rows to insert\n num_rows = 4000000\n \n # Size of each column (in characters)\n column_size = 50\n \n # Table name\n table_name = 'table1'\n \n # Call the insert_large_data function\n missing_index(num_threads, insert_duration, num_columns, num_rows, column_size, table_name)\n", + "description": "In a database system used by an online marketplace, there are 5 simultaneous search requests being made by users. The search is being performed on a table with 10 columns and 4,000,000 rows, where each column can hold up to 50 characters. However, the search queries are not utilizing the necessary indexes, which can result in slower performance or even database exceptions.\n", + "desc": "In the scenario of an e-commerce platform's database, there is a database used for storing and managing information about various products. This database includes a key table named 'ProductDetails', which contains detailed information about each product, such as product ID, name, price, stock quantity, brand, category, size, color, weight, user rating, number of reviews, production date, expiration date, supplier information, country, shipping method, discount information, image link, description, and product status. The 'ProductDetails' table contains a total of 4,000,000 rows of data, where each row represents a unique product, with 10 columns, each having a size of 50 characters. However, due to the absence of necessary indexes on commonly used search columns, such as name, brand, and category, the database is unable to efficiently handle concurrent search requests, leading to inefficiency and anomalies in the database. 
This could result in search delays or failures for users, negatively affecting their experience and the overall operational efficiency of the e-commerce platform.\n" + }, + "525": { + "start_time": "1697450152", + "end_time": "1697450223", + "start_timestamp": "2023-10-16 17:55:52", + "end_timestamp": "2023-10-16 17:57:03", + "alerts": [], + "labels": [ + "missing indexes" + ], + "command": "python anomaly_trigger/main.py --anomaly MISSING_INDEXES", + "script": "import psycopg2\nimport sys\nsys.path.append('/root/DB-GPT/')\nimport time\nimport datetime\nimport random\nimport yaml\nfrom multiprocessing.pool import *\n\n\nclass DBArgs(object):\n\n def __init__(self, dbtype, config, dbname=None):\n self.dbtype = dbtype\n if self.dbtype == 'mysql':\n self.host = config['host']\n self.port = config['port']\n self.user = config['user']\n self.password = config['password']\n self.dbname = dbname if dbname else config['dbname']\n self.driver = 'com.mysql.jdbc.Driver'\n self.jdbc = 'jdbc:mysql://'\n else:\n self.host = config['host']\n self.port = config['port']\n self.user = config['user']\n self.password = config['password']\n self.dbname = dbname if dbname else config['dbname']\n self.driver = 'org.postgresql.Driver'\n self.jdbc = 'jdbc:postgresql://'\n\nclass Database():\n def __init__(self, args, timeout=-1):\n self.args = args\n self.conn = self.resetConn(timeout)\n\n\n # self.schema = self.compute_table_schema()\n\n def resetConn(self, timeout=-1):\n conn = psycopg2.connect(database=self.args.dbname,\n user=self.args.user,\n password=self.args.password,\n host=self.args.host,\n port=self.args.port)\n return conn\n \n def execute_sqls(self,sql):\n self.conn =self.resetConn(timeout=-1)\n cur = self.conn.cursor()\n cur.execute(sql)\n self.conn.commit()\n cur.close()\n self.conn.close()\n\n def execute_sql_duration(self, duration, sql, max_id=0, commit_interval=500):\n self.conn = self.resetConn(timeout=-1)\n cursor = self.conn.cursor()\n start = time.time()\n cnt = 0\n if duration > 0:\n while (time.time() - start) < duration:\n if max_id > 0:\n id = random.randint(1, max_id - 1)\n cursor.execute(sql + str(id) + ';')\n else:\n cursor.execute(sql)\n cnt += 1\n if cnt % commit_interval == 0:\n self.conn.commit()\n else:\n print(\"error, the duration should be larger than 0\")\n self.conn.commit()\n cursor.close()\n self.conn.close()\n return cnt\n\n def concurrent_execute_sql(self, threads, duration, sql, max_id=0, commit_interval=500):\n pool = ThreadPool(threads)\n results = [pool.apply_async(self.execute_sql_duration, (duration, sql, max_id, commit_interval)) for _ in range(threads)]\n pool.close()\n pool.join()\n return results\n\ndef init():\n #add the config\n config_path = \"/root/DB-GPT/config/tool_config.yaml\"\n with open(config_path, 'r') as config_file:\n config = yaml.safe_load(config_file) \n db_args =DBArgs('pgsql', config)\n return db_args\n\n\n#create a table\ndef create_table(table_name,colsize, ncolumns):\n db=Database(init())\n column_definitions = ', '.join(f'name{i} varchar({colsize})' for i in range(ncolumns))\n creat_sql = f'CREATE TABLE {table_name} (id int, {column_definitions}, time timestamp);'\n db.execute_sqls(creat_sql)\n\n#delete the table\ndef delete_table(table_name):\n db=Database(init())\n delete_sql=f'DROP TABLE if exists {table_name}'\n db.execute_sqls(delete_sql)\n\n#print the current time\ndef print_time():\n current_time = datetime.datetime.now()\n formatted_time = current_time.strftime(\"%Y-%m-%d %H:%M:%S\")\n print(formatted_time)\n\n\ndef 
missing_index(threads,duration,ncolumns,nrows,colsize,table_name='table1'):\n #create a new table\n print_time()\n db=Database(init())\n delete_table(table_name)\n create_table(table_name,colsize, ncolumns)\n\n # insert some data to be selected \n insert_definitions = ', '.join(f'(SELECT substr(md5(random()::text), 1, {colsize}))' for i in range(ncolumns))\n insert_data=f'insert into {table_name} select generate_series(1,{nrows}),{insert_definitions}, now();'\n db.execute_sqls(insert_data) \n\n #select without the index\n missing_index='select * from '+table_name+' where id='\n db.concurrent_execute_sql(threads,duration,missing_index,nrows)\n\n #delete the table\n delete_table(table_name)\n #print the end time\n print_time()\n\nif __name__ == \"__main__\":\n # Number of threads to use for concurrent inserts\n num_threads = 5\n \n # Duration for which to run the inserts (in seconds)\n insert_duration = None\n \n # Number of columns in the table\n num_columns = 10\n \n # Number of rows to insert\n num_rows = 4000000\n \n # Size of each column (in characters)\n column_size = 50\n \n # Table name\n table_name = 'table1'\n \n # Call the insert_large_data function\n missing_index(num_threads, insert_duration, num_columns, num_rows, column_size, table_name)\n", + "description": "In an online banking system, if there are 5 users searching in the database table containing 10 columns, 4,000,000 rows, each column size of 50 characters, but the search lacks the necessary index, resulting in an exception in the database.\n", + "desc": "In an online banking scenario, there is a database called 'BankingDB' that manages customer and transaction data for a bank. Within this database, there is a table called 'AccountTransactions' that stores information about various banking transactions. This table contains 4,000,000 rows of data, with each row representing a transaction record for an account. There are a total of 10 columns in this table, each capable of holding information of up to 50 characters. These columns include transaction ID, account number, transaction type, transaction amount, transaction date and time, counterparty account information, transaction status, employee ID, transaction location, and currency type. In this scenario, a lack of necessary indexes on commonly used columns, such as account number or transaction type, can lead to inefficient query performance. 
As a result, when 5 concurrent queries are executed, the database may experience delays or failures in processing these queries, ultimately impacting the overall efficiency of the banking operations.\n" + }, + "526": { + "start_time": "1697450283", + "end_time": "1697450356", + "start_timestamp": "2023-10-16 17:58:03", + "end_timestamp": "2023-10-16 17:59:16", + "alerts": [], + "labels": [ + "missing indexes" + ], + "command": "python anomaly_trigger/main.py --anomaly MISSING_INDEXES", + "script": "import psycopg2\nimport sys\nsys.path.append('/root/DB-GPT/')\nimport time\nimport datetime\nimport random\nimport yaml\nfrom multiprocessing.pool import *\n\n\nclass DBArgs(object):\n\n def __init__(self, dbtype, config, dbname=None):\n self.dbtype = dbtype\n if self.dbtype == 'mysql':\n self.host = config['host']\n self.port = config['port']\n self.user = config['user']\n self.password = config['password']\n self.dbname = dbname if dbname else config['dbname']\n self.driver = 'com.mysql.jdbc.Driver'\n self.jdbc = 'jdbc:mysql://'\n else:\n self.host = config['host']\n self.port = config['port']\n self.user = config['user']\n self.password = config['password']\n self.dbname = dbname if dbname else config['dbname']\n self.driver = 'org.postgresql.Driver'\n self.jdbc = 'jdbc:postgresql://'\n\nclass Database():\n def __init__(self, args, timeout=-1):\n self.args = args\n self.conn = self.resetConn(timeout)\n\n\n # self.schema = self.compute_table_schema()\n\n def resetConn(self, timeout=-1):\n conn = psycopg2.connect(database=self.args.dbname,\n user=self.args.user,\n password=self.args.password,\n host=self.args.host,\n port=self.args.port)\n return conn\n \n def execute_sqls(self,sql):\n self.conn =self.resetConn(timeout=-1)\n cur = self.conn.cursor()\n cur.execute(sql)\n self.conn.commit()\n cur.close()\n self.conn.close()\n\n def execute_sql_duration(self, duration, sql, max_id=0, commit_interval=500):\n self.conn = self.resetConn(timeout=-1)\n cursor = self.conn.cursor()\n start = time.time()\n cnt = 0\n if duration > 0:\n while (time.time() - start) < duration:\n if max_id > 0:\n id = random.randint(1, max_id - 1)\n cursor.execute(sql + str(id) + ';')\n else:\n cursor.execute(sql)\n cnt += 1\n if cnt % commit_interval == 0:\n self.conn.commit()\n else:\n print(\"error, the duration should be larger than 0\")\n self.conn.commit()\n cursor.close()\n self.conn.close()\n return cnt\n\n def concurrent_execute_sql(self, threads, duration, sql, max_id=0, commit_interval=500):\n pool = ThreadPool(threads)\n results = [pool.apply_async(self.execute_sql_duration, (duration, sql, max_id, commit_interval)) for _ in range(threads)]\n pool.close()\n pool.join()\n return results\n\ndef init():\n #add the config\n config_path = \"/root/DB-GPT/config/tool_config.yaml\"\n with open(config_path, 'r') as config_file:\n config = yaml.safe_load(config_file) \n db_args =DBArgs('pgsql', config)\n return db_args\n\n\n#create a table\ndef create_table(table_name,colsize, ncolumns):\n db=Database(init())\n column_definitions = ', '.join(f'name{i} varchar({colsize})' for i in range(ncolumns))\n creat_sql = f'CREATE TABLE {table_name} (id int, {column_definitions}, time timestamp);'\n db.execute_sqls(creat_sql)\n\n#delete the table\ndef delete_table(table_name):\n db=Database(init())\n delete_sql=f'DROP TABLE if exists {table_name}'\n db.execute_sqls(delete_sql)\n\n#print the current time\ndef print_time():\n current_time = datetime.datetime.now()\n formatted_time = current_time.strftime(\"%Y-%m-%d %H:%M:%S\")\n 
print(formatted_time)\n\n\ndef missing_index(threads,duration,ncolumns,nrows,colsize,table_name='table1'):\n #create a new table\n print_time()\n db=Database(init())\n delete_table(table_name)\n create_table(table_name,colsize, ncolumns)\n\n # insert some data to be selected \n insert_definitions = ', '.join(f'(SELECT substr(md5(random()::text), 1, {colsize}))' for i in range(ncolumns))\n insert_data=f'insert into {table_name} select generate_series(1,{nrows}),{insert_definitions}, now();'\n db.execute_sqls(insert_data) \n\n #select without the index\n missing_index='select * from '+table_name+' where id='\n db.concurrent_execute_sql(threads,duration,missing_index,nrows)\n\n #delete the table\n delete_table(table_name)\n #print the end time\n print_time()\n\nif __name__ == \"__main__\":\n # Number of threads to use for concurrent inserts\n num_threads = 10\n \n # Duration for which to run the inserts (in seconds)\n insert_duration = None\n \n # Number of columns in the table\n num_columns = 10\n \n # Number of rows to insert\n num_rows = 4000000\n \n # Size of each column (in characters)\n column_size = 50\n \n # Table name\n table_name = 'table1'\n \n # Call the insert_large_data function\n missing_index(num_threads, insert_duration, num_columns, num_rows, column_size, table_name)\n", + "description": "In a database system used by a social media platform, when 10 users are searching a database table with 10 columns, 4,000,000 rows, each column having a size of 50 characters, an exception occurs due to the lack of necessary indexes for efficient searching.\n", + "desc": "In a banking scenario, there is a database specifically designed to handle customer and transaction data for a bank. This database is called 'BankingDB'. Within this database, there is a key table named 'AccountTransactions' which records detailed information about various banking transactions. The table contains 4,000,000 rows of data, with each row representing a transaction record for an account. Additionally, the table consists of 10 columns, each containing information up to 50 characters. These columns may include transaction ID, account number, transaction type (such as deposit, withdrawal, or transfer), transaction amount, transaction date and time, counterparty account information, transaction status, employee ID (of the bank staff who processed the transaction), transaction location, and currency type, among others. In this scenario, due to the lack of necessary indexes on commonly used columns such as transaction type, account number, and transaction date, the database is inefficient in handling a large number of concurrent transaction queries. As a result, the database experiences performance issues, which can lead to anomalies. 
These anomalies may result in delays or failures in processing customer transactions and can adversely affect the overall operational efficiency and reputation of the bank.\n" + }, + "527": { + "start_time": "1697450416", + "end_time": "1697450490", + "start_timestamp": "2023-10-16 18:00:16", + "end_timestamp": "2023-10-16 18:01:30", + "alerts": [], + "labels": [ + "missing indexes" + ], + "command": "python anomaly_trigger/main.py --anomaly MISSING_INDEXES", + "script": "import psycopg2\nimport sys\nsys.path.append('/root/DB-GPT/')\nimport time\nimport datetime\nimport random\nimport yaml\nfrom multiprocessing.pool import *\n\n\nclass DBArgs(object):\n\n def __init__(self, dbtype, config, dbname=None):\n self.dbtype = dbtype\n if self.dbtype == 'mysql':\n self.host = config['host']\n self.port = config['port']\n self.user = config['user']\n self.password = config['password']\n self.dbname = dbname if dbname else config['dbname']\n self.driver = 'com.mysql.jdbc.Driver'\n self.jdbc = 'jdbc:mysql://'\n else:\n self.host = config['host']\n self.port = config['port']\n self.user = config['user']\n self.password = config['password']\n self.dbname = dbname if dbname else config['dbname']\n self.driver = 'org.postgresql.Driver'\n self.jdbc = 'jdbc:postgresql://'\n\nclass Database():\n def __init__(self, args, timeout=-1):\n self.args = args\n self.conn = self.resetConn(timeout)\n\n\n # self.schema = self.compute_table_schema()\n\n def resetConn(self, timeout=-1):\n conn = psycopg2.connect(database=self.args.dbname,\n user=self.args.user,\n password=self.args.password,\n host=self.args.host,\n port=self.args.port)\n return conn\n \n def execute_sqls(self,sql):\n self.conn =self.resetConn(timeout=-1)\n cur = self.conn.cursor()\n cur.execute(sql)\n self.conn.commit()\n cur.close()\n self.conn.close()\n\n def execute_sql_duration(self, duration, sql, max_id=0, commit_interval=500):\n self.conn = self.resetConn(timeout=-1)\n cursor = self.conn.cursor()\n start = time.time()\n cnt = 0\n if duration > 0:\n while (time.time() - start) < duration:\n if max_id > 0:\n id = random.randint(1, max_id - 1)\n cursor.execute(sql + str(id) + ';')\n else:\n cursor.execute(sql)\n cnt += 1\n if cnt % commit_interval == 0:\n self.conn.commit()\n else:\n print(\"error, the duration should be larger than 0\")\n self.conn.commit()\n cursor.close()\n self.conn.close()\n return cnt\n\n def concurrent_execute_sql(self, threads, duration, sql, max_id=0, commit_interval=500):\n pool = ThreadPool(threads)\n results = [pool.apply_async(self.execute_sql_duration, (duration, sql, max_id, commit_interval)) for _ in range(threads)]\n pool.close()\n pool.join()\n return results\n\ndef init():\n #add the config\n config_path = \"/root/DB-GPT/config/tool_config.yaml\"\n with open(config_path, 'r') as config_file:\n config = yaml.safe_load(config_file) \n db_args =DBArgs('pgsql', config)\n return db_args\n\n\n#create a table\ndef create_table(table_name,colsize, ncolumns):\n db=Database(init())\n column_definitions = ', '.join(f'name{i} varchar({colsize})' for i in range(ncolumns))\n creat_sql = f'CREATE TABLE {table_name} (id int, {column_definitions}, time timestamp);'\n db.execute_sqls(creat_sql)\n\n#delete the table\ndef delete_table(table_name):\n db=Database(init())\n delete_sql=f'DROP TABLE if exists {table_name}'\n db.execute_sqls(delete_sql)\n\n#print the current time\ndef print_time():\n current_time = datetime.datetime.now()\n formatted_time = current_time.strftime(\"%Y-%m-%d %H:%M:%S\")\n print(formatted_time)\n\n\ndef 
missing_index(threads,duration,ncolumns,nrows,colsize,table_name='table1'):\n #create a new table\n print_time()\n db=Database(init())\n delete_table(table_name)\n create_table(table_name,colsize, ncolumns)\n\n # insert some data to be selected \n insert_definitions = ', '.join(f'(SELECT substr(md5(random()::text), 1, {colsize}))' for i in range(ncolumns))\n insert_data=f'insert into {table_name} select generate_series(1,{nrows}),{insert_definitions}, now();'\n db.execute_sqls(insert_data) \n\n #select without the index\n missing_index='select * from '+table_name+' where id='\n db.concurrent_execute_sql(threads,duration,missing_index,nrows)\n\n #delete the table\n delete_table(table_name)\n #print the end time\n print_time()\n\nif __name__ == \"__main__\":\n # Number of threads to use for concurrent inserts\n num_threads = 10\n \n # Duration for which to run the inserts (in seconds)\n insert_duration = None\n \n # Number of columns in the table\n num_columns = 10\n \n # Number of rows to insert\n num_rows = 4000000\n \n # Size of each column (in characters)\n column_size = 50\n \n # Table name\n table_name = 'table1'\n \n # Call the insert_large_data function\n missing_index(num_threads, insert_duration, num_columns, num_rows, column_size, table_name)\n", + "description": "In a large database of a financial organization, 10 employees are simultaneously searching for information in a database table with 10 columns and 4,000,000 rows. Each column has a size of 50 characters. However, there is no index created for the search, resulting in slow query performance and possible database exceptions.\n", + "desc": "In an online banking scenario, suppose there is a database named 'BankingDB' that handles customer and transaction data for a bank. Within this database, there is a key table called 'TransactionDetails' that stores detailed information about various banking transactions. This table contains a total of 4,000,000 rows of data, with each row representing a transaction record. There are 10 columns in this table, each capable of storing information up to 50 characters. These columns may include transaction ID, account number, transaction type (such as deposit, withdrawal, transfer), transaction amount, transaction date and time, counterparty account information, transaction status, employee ID (of the bank staff who processed the transaction), transaction location, and currency type. In this scenario, due to frequent and concurrent access to the 'TransactionDetails' table, the database might experience performance issues. The lack of necessary indexes, especially on commonly used columns such as transaction type, account number, or transaction date, can make searching for specific transactions inefficient. 
This inefficiency can lead to anomalies in the database, such as search delays or failures, and can potentially affect other banking operations that rely on transaction data.\n" + }, + "528": { + "start_time": "1697450573", + "end_time": "1697450644", + "start_timestamp": "2023-10-16 18:02:53", + "end_timestamp": "2023-10-16 18:04:04", + "alerts": [], + "labels": [ + "missing indexes" + ], + "command": "python anomaly_trigger/main.py --anomaly MISSING_INDEXES", + "script": "import psycopg2\nimport sys\nsys.path.append('/root/DB-GPT/')\nimport time\nimport datetime\nimport random\nimport yaml\nfrom multiprocessing.pool import *\n\n\nclass DBArgs(object):\n\n def __init__(self, dbtype, config, dbname=None):\n self.dbtype = dbtype\n if self.dbtype == 'mysql':\n self.host = config['host']\n self.port = config['port']\n self.user = config['user']\n self.password = config['password']\n self.dbname = dbname if dbname else config['dbname']\n self.driver = 'com.mysql.jdbc.Driver'\n self.jdbc = 'jdbc:mysql://'\n else:\n self.host = config['host']\n self.port = config['port']\n self.user = config['user']\n self.password = config['password']\n self.dbname = dbname if dbname else config['dbname']\n self.driver = 'org.postgresql.Driver'\n self.jdbc = 'jdbc:postgresql://'\n\nclass Database():\n def __init__(self, args, timeout=-1):\n self.args = args\n self.conn = self.resetConn(timeout)\n\n\n # self.schema = self.compute_table_schema()\n\n def resetConn(self, timeout=-1):\n conn = psycopg2.connect(database=self.args.dbname,\n user=self.args.user,\n password=self.args.password,\n host=self.args.host,\n port=self.args.port)\n return conn\n \n def execute_sqls(self,sql):\n self.conn =self.resetConn(timeout=-1)\n cur = self.conn.cursor()\n cur.execute(sql)\n self.conn.commit()\n cur.close()\n self.conn.close()\n\n def execute_sql_duration(self, duration, sql, max_id=0, commit_interval=500):\n self.conn = self.resetConn(timeout=-1)\n cursor = self.conn.cursor()\n start = time.time()\n cnt = 0\n if duration > 0:\n while (time.time() - start) < duration:\n if max_id > 0:\n id = random.randint(1, max_id - 1)\n cursor.execute(sql + str(id) + ';')\n else:\n cursor.execute(sql)\n cnt += 1\n if cnt % commit_interval == 0:\n self.conn.commit()\n else:\n print(\"error, the duration should be larger than 0\")\n self.conn.commit()\n cursor.close()\n self.conn.close()\n return cnt\n\n def concurrent_execute_sql(self, threads, duration, sql, max_id=0, commit_interval=500):\n pool = ThreadPool(threads)\n results = [pool.apply_async(self.execute_sql_duration, (duration, sql, max_id, commit_interval)) for _ in range(threads)]\n pool.close()\n pool.join()\n return results\n\ndef init():\n #add the config\n config_path = \"/root/DB-GPT/config/tool_config.yaml\"\n with open(config_path, 'r') as config_file:\n config = yaml.safe_load(config_file) \n db_args =DBArgs('pgsql', config)\n return db_args\n\n\n#create a table\ndef create_table(table_name,colsize, ncolumns):\n db=Database(init())\n column_definitions = ', '.join(f'name{i} varchar({colsize})' for i in range(ncolumns))\n creat_sql = f'CREATE TABLE {table_name} (id int, {column_definitions}, time timestamp);'\n db.execute_sqls(creat_sql)\n\n#delete the table\ndef delete_table(table_name):\n db=Database(init())\n delete_sql=f'DROP TABLE if exists {table_name}'\n db.execute_sqls(delete_sql)\n\n#print the current time\ndef print_time():\n current_time = datetime.datetime.now()\n formatted_time = current_time.strftime(\"%Y-%m-%d %H:%M:%S\")\n print(formatted_time)\n\n\ndef 
missing_index(threads,duration,ncolumns,nrows,colsize,table_name='table1'):\n #create a new table\n print_time()\n db=Database(init())\n delete_table(table_name)\n create_table(table_name,colsize, ncolumns)\n\n # insert some data to be selected \n insert_definitions = ', '.join(f'(SELECT substr(md5(random()::text), 1, {colsize}))' for i in range(ncolumns))\n insert_data=f'insert into {table_name} select generate_series(1,{nrows}),{insert_definitions}, now();'\n db.execute_sqls(insert_data) \n\n #select without the index\n missing_index='select * from '+table_name+' where id='\n db.concurrent_execute_sql(threads,duration,missing_index,nrows)\n\n #delete the table\n delete_table(table_name)\n #print the end time\n print_time()\n\nif __name__ == \"__main__\":\n # Number of threads to use for concurrent inserts\n num_threads = 5\n \n # Duration for which to run the inserts (in seconds)\n insert_duration = None\n \n # Number of columns in the table\n num_columns = 10\n \n # Number of rows to insert\n num_rows = 2000000\n \n # Size of each column (in characters)\n column_size = 100\n \n # Table name\n table_name = 'table1'\n \n # Call the insert_large_data function\n missing_index(num_threads, insert_duration, num_columns, num_rows, column_size, table_name)\n", + "description": "In a large-scale data processing scenario, where there are 5 concurrent users searching in a database table with 10 columns and 2 million rows, each column containing 100 characters, the absence of necessary indexes in the search operation leads to a database exception.\n", + "desc": "In the database of an online banking system, suppose there is a database named 'BankingDB' used for storing and processing customer and transaction data. Within this database, there is a key table named 'TransactionHistory' that records detailed information about various banking transactions. This table contains 2,000,000 rows of data, each representing a transaction record, with a total of 10 columns, each containing information of up to 100 characters. These columns may include transaction ID, account number, transaction type (such as deposit, withdrawal, transfer), transaction amount, transaction date and time, counterparty account information, transaction status, employee ID (of the bank staff who processed the transaction), transaction location, and currency type. However, there are missing indexes on commonly used columns such as account number, transaction type, and transaction date and time. This lack of necessary indexes leads to inefficiencies in retrieving specific transactions or generating transaction history reports. As a result, the database becomes slower and less responsive, potentially causing delays in processing customer requests, generating transaction reports, or performing analytical queries. 
This can negatively impact the overall efficiency and user experience of the online banking system.\n" + }, + "529": { + "start_time": "1697450704", + "end_time": "1697450775", + "start_timestamp": "2023-10-16 18:05:04", + "end_timestamp": "2023-10-16 18:06:15", + "alerts": [], + "labels": [ + "missing indexes" + ], + "command": "python anomaly_trigger/main.py --anomaly MISSING_INDEXES", + "script": "import psycopg2\nimport sys\nsys.path.append('/root/DB-GPT/')\nimport time\nimport datetime\nimport random\nimport yaml\nfrom multiprocessing.pool import *\n\n\nclass DBArgs(object):\n\n def __init__(self, dbtype, config, dbname=None):\n self.dbtype = dbtype\n if self.dbtype == 'mysql':\n self.host = config['host']\n self.port = config['port']\n self.user = config['user']\n self.password = config['password']\n self.dbname = dbname if dbname else config['dbname']\n self.driver = 'com.mysql.jdbc.Driver'\n self.jdbc = 'jdbc:mysql://'\n else:\n self.host = config['host']\n self.port = config['port']\n self.user = config['user']\n self.password = config['password']\n self.dbname = dbname if dbname else config['dbname']\n self.driver = 'org.postgresql.Driver'\n self.jdbc = 'jdbc:postgresql://'\n\nclass Database():\n def __init__(self, args, timeout=-1):\n self.args = args\n self.conn = self.resetConn(timeout)\n\n\n # self.schema = self.compute_table_schema()\n\n def resetConn(self, timeout=-1):\n conn = psycopg2.connect(database=self.args.dbname,\n user=self.args.user,\n password=self.args.password,\n host=self.args.host,\n port=self.args.port)\n return conn\n \n def execute_sqls(self,sql):\n self.conn =self.resetConn(timeout=-1)\n cur = self.conn.cursor()\n cur.execute(sql)\n self.conn.commit()\n cur.close()\n self.conn.close()\n\n def execute_sql_duration(self, duration, sql, max_id=0, commit_interval=500):\n self.conn = self.resetConn(timeout=-1)\n cursor = self.conn.cursor()\n start = time.time()\n cnt = 0\n if duration > 0:\n while (time.time() - start) < duration:\n if max_id > 0:\n id = random.randint(1, max_id - 1)\n cursor.execute(sql + str(id) + ';')\n else:\n cursor.execute(sql)\n cnt += 1\n if cnt % commit_interval == 0:\n self.conn.commit()\n else:\n print(\"error, the duration should be larger than 0\")\n self.conn.commit()\n cursor.close()\n self.conn.close()\n return cnt\n\n def concurrent_execute_sql(self, threads, duration, sql, max_id=0, commit_interval=500):\n pool = ThreadPool(threads)\n results = [pool.apply_async(self.execute_sql_duration, (duration, sql, max_id, commit_interval)) for _ in range(threads)]\n pool.close()\n pool.join()\n return results\n\ndef init():\n #add the config\n config_path = \"/root/DB-GPT/config/tool_config.yaml\"\n with open(config_path, 'r') as config_file:\n config = yaml.safe_load(config_file) \n db_args =DBArgs('pgsql', config)\n return db_args\n\n\n#create a table\ndef create_table(table_name,colsize, ncolumns):\n db=Database(init())\n column_definitions = ', '.join(f'name{i} varchar({colsize})' for i in range(ncolumns))\n creat_sql = f'CREATE TABLE {table_name} (id int, {column_definitions}, time timestamp);'\n db.execute_sqls(creat_sql)\n\n#delete the table\ndef delete_table(table_name):\n db=Database(init())\n delete_sql=f'DROP TABLE if exists {table_name}'\n db.execute_sqls(delete_sql)\n\n#print the current time\ndef print_time():\n current_time = datetime.datetime.now()\n formatted_time = current_time.strftime(\"%Y-%m-%d %H:%M:%S\")\n print(formatted_time)\n\n\ndef missing_index(threads,duration,ncolumns,nrows,colsize,table_name='table1'):\n 
#create a new table\n print_time()\n db=Database(init())\n delete_table(table_name)\n create_table(table_name,colsize, ncolumns)\n\n # insert some data to be selected \n insert_definitions = ', '.join(f'(SELECT substr(md5(random()::text), 1, {colsize}))' for i in range(ncolumns))\n insert_data=f'insert into {table_name} select generate_series(1,{nrows}),{insert_definitions}, now();'\n db.execute_sqls(insert_data) \n\n #select without the index\n missing_index='select * from '+table_name+' where id='\n db.concurrent_execute_sql(threads,duration,missing_index,nrows)\n\n #delete the table\n delete_table(table_name)\n #print the end time\n print_time()\n\nif __name__ == \"__main__\":\n # Number of threads to use for concurrent inserts\n num_threads = 5\n \n # Duration for which to run the inserts (in seconds)\n insert_duration = None\n \n # Number of columns in the table\n num_columns = 10\n \n # Number of rows to insert\n num_rows = 2000000\n \n # Size of each column (in characters)\n column_size = 100\n \n # Table name\n table_name = 'table1'\n \n # Call the insert_large_data function\n missing_index(num_threads, insert_duration, num_columns, num_rows, column_size, table_name)\n", + "description": "In a database for an online shopping platform, if 5 users search in a table with 10 columns, each column containing 100 characters, and a total of 2,000,000 rows, but the necessary index is missing, it may result in a database exception.\n", + "desc": "In a banking scenario, suppose there is a database named 'BankingDB' that stores customer and transaction data for a bank. Within this database, there is a table named 'TransactionHistory' that holds detailed information about various transactions. This table contains 2,000,000 rows of data, each representing a transaction, with a total of 10 columns, each column capable of holding up to 100 characters. These columns may include transaction ID, account number, transaction type (e.g., deposit, withdrawal, transfer), transaction amount, transaction date and time, counterparty account information, transaction status, employee ID (of the bank staff who processed the transaction), transaction location, and currency type. However, due to the absence of necessary indexes on commonly used columns such as account number, transaction type, and transaction date, the database faces performance inefficiencies when handling a large number of concurrent transaction queries. 
This inefficiency can result in delayed or failed transactions, impacting the overall operational efficiency and user experience within the banking system.\n" + }, + "530": { + "start_time": "1697450835", + "end_time": "1697450907", + "start_timestamp": "2023-10-16 18:07:15", + "end_timestamp": "2023-10-16 18:08:27", + "alerts": [], + "labels": [ + "missing indexes" + ], + "command": "python anomaly_trigger/main.py --anomaly MISSING_INDEXES", + "script": "import psycopg2\nimport sys\nsys.path.append('/root/DB-GPT/')\nimport time\nimport datetime\nimport random\nimport yaml\nfrom multiprocessing.pool import *\n\n\nclass DBArgs(object):\n\n def __init__(self, dbtype, config, dbname=None):\n self.dbtype = dbtype\n if self.dbtype == 'mysql':\n self.host = config['host']\n self.port = config['port']\n self.user = config['user']\n self.password = config['password']\n self.dbname = dbname if dbname else config['dbname']\n self.driver = 'com.mysql.jdbc.Driver'\n self.jdbc = 'jdbc:mysql://'\n else:\n self.host = config['host']\n self.port = config['port']\n self.user = config['user']\n self.password = config['password']\n self.dbname = dbname if dbname else config['dbname']\n self.driver = 'org.postgresql.Driver'\n self.jdbc = 'jdbc:postgresql://'\n\nclass Database():\n def __init__(self, args, timeout=-1):\n self.args = args\n self.conn = self.resetConn(timeout)\n\n\n # self.schema = self.compute_table_schema()\n\n def resetConn(self, timeout=-1):\n conn = psycopg2.connect(database=self.args.dbname,\n user=self.args.user,\n password=self.args.password,\n host=self.args.host,\n port=self.args.port)\n return conn\n \n def execute_sqls(self,sql):\n self.conn =self.resetConn(timeout=-1)\n cur = self.conn.cursor()\n cur.execute(sql)\n self.conn.commit()\n cur.close()\n self.conn.close()\n\n def execute_sql_duration(self, duration, sql, max_id=0, commit_interval=500):\n self.conn = self.resetConn(timeout=-1)\n cursor = self.conn.cursor()\n start = time.time()\n cnt = 0\n if duration > 0:\n while (time.time() - start) < duration:\n if max_id > 0:\n id = random.randint(1, max_id - 1)\n cursor.execute(sql + str(id) + ';')\n else:\n cursor.execute(sql)\n cnt += 1\n if cnt % commit_interval == 0:\n self.conn.commit()\n else:\n print(\"error, the duration should be larger than 0\")\n self.conn.commit()\n cursor.close()\n self.conn.close()\n return cnt\n\n def concurrent_execute_sql(self, threads, duration, sql, max_id=0, commit_interval=500):\n pool = ThreadPool(threads)\n results = [pool.apply_async(self.execute_sql_duration, (duration, sql, max_id, commit_interval)) for _ in range(threads)]\n pool.close()\n pool.join()\n return results\n\ndef init():\n #add the config\n config_path = \"/root/DB-GPT/config/tool_config.yaml\"\n with open(config_path, 'r') as config_file:\n config = yaml.safe_load(config_file) \n db_args =DBArgs('pgsql', config)\n return db_args\n\n\n#create a table\ndef create_table(table_name,colsize, ncolumns):\n db=Database(init())\n column_definitions = ', '.join(f'name{i} varchar({colsize})' for i in range(ncolumns))\n creat_sql = f'CREATE TABLE {table_name} (id int, {column_definitions}, time timestamp);'\n db.execute_sqls(creat_sql)\n\n#delete the table\ndef delete_table(table_name):\n db=Database(init())\n delete_sql=f'DROP TABLE if exists {table_name}'\n db.execute_sqls(delete_sql)\n\n#print the current time\ndef print_time():\n current_time = datetime.datetime.now()\n formatted_time = current_time.strftime(\"%Y-%m-%d %H:%M:%S\")\n print(formatted_time)\n\n\ndef 
missing_index(threads,duration,ncolumns,nrows,colsize,table_name='table1'):\n #create a new table\n print_time()\n db=Database(init())\n delete_table(table_name)\n create_table(table_name,colsize, ncolumns)\n\n # insert some data to be selected \n insert_definitions = ', '.join(f'(SELECT substr(md5(random()::text), 1, {colsize}))' for i in range(ncolumns))\n insert_data=f'insert into {table_name} select generate_series(1,{nrows}),{insert_definitions}, now();'\n db.execute_sqls(insert_data) \n\n #select without the index\n missing_index='select * from '+table_name+' where id='\n db.concurrent_execute_sql(threads,duration,missing_index,nrows)\n\n #delete the table\n delete_table(table_name)\n #print the end time\n print_time()\n\nif __name__ == \"__main__\":\n # Number of threads to use for concurrent inserts\n num_threads = 10\n \n # Duration for which to run the inserts (in seconds)\n insert_duration = None\n \n # Number of columns in the table\n num_columns = 10\n \n # Number of rows to insert\n num_rows = 2000000\n \n # Size of each column (in characters)\n column_size = 100\n \n # Table name\n table_name = 'table1'\n \n # Call the insert_large_data function\n missing_index(num_threads, insert_duration, num_columns, num_rows, column_size, table_name)\n", + "description": "In an analytics system, 10 users simultaneously search a database table with 10 columns, 2,000,000 rows, each column containing 100 characters. However, the search query lacks the necessary indexes, resulting in an exception being thrown in the database.\n", + "desc": "In a bank's database, there is a key table named 'AccountDetails', which stores information about various bank accounts. This table contains 2,000,000 rows of data, each representing a different account, with a total of 10 columns. These columns might include account number, account holder name, account type (such as savings or checking), balance, transaction history, interest rate, date of account creation, last transaction date, account status, and account fees. However, the database lacks necessary indexes on commonly queried columns such as account number, account holder name, and account type. As a result, when multiple users simultaneously query the database for account information, the lack of indexes causes the database to become inefficient and struggle in handling the large number of concurrent queries. This inefficiency ultimately leads to anomalies in the database, resulting in delayed or failed account information retrieval. 
This can negatively impact the bank's operations and customer satisfaction.\n" + }, + "531": { + "start_time": "1697450968", + "end_time": "1697451039", + "start_timestamp": "2023-10-16 18:09:28", + "end_timestamp": "2023-10-16 18:10:39", + "alerts": [], + "labels": [ + "missing indexes" + ], + "command": "python anomaly_trigger/main.py --anomaly MISSING_INDEXES", + "script": "import psycopg2\nimport sys\nsys.path.append('/root/DB-GPT/')\nimport time\nimport datetime\nimport random\nimport yaml\nfrom multiprocessing.pool import *\n\n\nclass DBArgs(object):\n\n def __init__(self, dbtype, config, dbname=None):\n self.dbtype = dbtype\n if self.dbtype == 'mysql':\n self.host = config['host']\n self.port = config['port']\n self.user = config['user']\n self.password = config['password']\n self.dbname = dbname if dbname else config['dbname']\n self.driver = 'com.mysql.jdbc.Driver'\n self.jdbc = 'jdbc:mysql://'\n else:\n self.host = config['host']\n self.port = config['port']\n self.user = config['user']\n self.password = config['password']\n self.dbname = dbname if dbname else config['dbname']\n self.driver = 'org.postgresql.Driver'\n self.jdbc = 'jdbc:postgresql://'\n\nclass Database():\n def __init__(self, args, timeout=-1):\n self.args = args\n self.conn = self.resetConn(timeout)\n\n\n # self.schema = self.compute_table_schema()\n\n def resetConn(self, timeout=-1):\n conn = psycopg2.connect(database=self.args.dbname,\n user=self.args.user,\n password=self.args.password,\n host=self.args.host,\n port=self.args.port)\n return conn\n \n def execute_sqls(self,sql):\n self.conn =self.resetConn(timeout=-1)\n cur = self.conn.cursor()\n cur.execute(sql)\n self.conn.commit()\n cur.close()\n self.conn.close()\n\n def execute_sql_duration(self, duration, sql, max_id=0, commit_interval=500):\n self.conn = self.resetConn(timeout=-1)\n cursor = self.conn.cursor()\n start = time.time()\n cnt = 0\n if duration > 0:\n while (time.time() - start) < duration:\n if max_id > 0:\n id = random.randint(1, max_id - 1)\n cursor.execute(sql + str(id) + ';')\n else:\n cursor.execute(sql)\n cnt += 1\n if cnt % commit_interval == 0:\n self.conn.commit()\n else:\n print(\"error, the duration should be larger than 0\")\n self.conn.commit()\n cursor.close()\n self.conn.close()\n return cnt\n\n def concurrent_execute_sql(self, threads, duration, sql, max_id=0, commit_interval=500):\n pool = ThreadPool(threads)\n results = [pool.apply_async(self.execute_sql_duration, (duration, sql, max_id, commit_interval)) for _ in range(threads)]\n pool.close()\n pool.join()\n return results\n\ndef init():\n #add the config\n config_path = \"/root/DB-GPT/config/tool_config.yaml\"\n with open(config_path, 'r') as config_file:\n config = yaml.safe_load(config_file) \n db_args =DBArgs('pgsql', config)\n return db_args\n\n\n#create a table\ndef create_table(table_name,colsize, ncolumns):\n db=Database(init())\n column_definitions = ', '.join(f'name{i} varchar({colsize})' for i in range(ncolumns))\n creat_sql = f'CREATE TABLE {table_name} (id int, {column_definitions}, time timestamp);'\n db.execute_sqls(creat_sql)\n\n#delete the table\ndef delete_table(table_name):\n db=Database(init())\n delete_sql=f'DROP TABLE if exists {table_name}'\n db.execute_sqls(delete_sql)\n\n#print the current time\ndef print_time():\n current_time = datetime.datetime.now()\n formatted_time = current_time.strftime(\"%Y-%m-%d %H:%M:%S\")\n print(formatted_time)\n\n\ndef missing_index(threads,duration,ncolumns,nrows,colsize,table_name='table1'):\n #create a new table\n 
print_time()\n db=Database(init())\n delete_table(table_name)\n create_table(table_name,colsize, ncolumns)\n\n # insert some data to be selected \n insert_definitions = ', '.join(f'(SELECT substr(md5(random()::text), 1, {colsize}))' for i in range(ncolumns))\n insert_data=f'insert into {table_name} select generate_series(1,{nrows}),{insert_definitions}, now();'\n db.execute_sqls(insert_data) \n\n #select without the index\n missing_index='select * from '+table_name+' where id='\n db.concurrent_execute_sql(threads,duration,missing_index,nrows)\n\n #delete the table\n delete_table(table_name)\n #print the end time\n print_time()\n\nif __name__ == \"__main__\":\n # Number of threads to use for concurrent inserts\n num_threads = 10\n \n # Duration for which to run the inserts (in seconds)\n insert_duration = None\n \n # Number of columns in the table\n num_columns = 10\n \n # Number of rows to insert\n num_rows = 2000000\n \n # Size of each column (in characters)\n column_size = 100\n \n # Table name\n table_name = 'table1'\n \n # Call the insert_large_data function\n missing_index(num_threads, insert_duration, num_columns, num_rows, column_size, table_name)\n", + "description": "In a database system used by a large online platform, 10 users simultaneously search for data in a table with 10 columns and 2,000,000 rows. Each column contains 100 characters. However, the search operation does not utilize the necessary indexes, resulting in a performance issue and potential exceptions in the database.\n", + "desc": "In a banking scenario, there is a database called 'BankingDB' which is used for storing and managing customer and transaction data. Within this database, there is a key table named 'AccountTransactions' which records detailed information about various banking transactions. This table consists of 2,000,000 rows of data, with each row representing a transaction record. The table has 10 columns, including transaction ID, account number, transaction type, transaction amount, transaction date and time, counterparty account information, transaction status, employee ID, transaction location, and currency type. However, there is an issue with the lack of necessary indexes in this table, particularly on commonly used columns such as transaction type, account number, and transaction date. This lack of proper indexing results in inefficiency when handling a large number of concurrent transaction inquiries. 
As a result, the database experiences anomalies such as slow response times, failed queries, and inefficiency in processing customer transactions.\n" + }, + "532": { + "start_time": "1697451151", + "end_time": "1697451224", + "start_timestamp": "2023-10-16 18:12:31", + "end_timestamp": "2023-10-16 18:13:44", + "alerts": [], + "labels": [ + "missing indexes" + ], + "command": "python anomaly_trigger/main.py --anomaly MISSING_INDEXES", + "script": "import psycopg2\nimport sys\nsys.path.append('/root/DB-GPT/')\nimport time\nimport datetime\nimport random\nimport yaml\nfrom multiprocessing.pool import *\n\n\nclass DBArgs(object):\n\n def __init__(self, dbtype, config, dbname=None):\n self.dbtype = dbtype\n if self.dbtype == 'mysql':\n self.host = config['host']\n self.port = config['port']\n self.user = config['user']\n self.password = config['password']\n self.dbname = dbname if dbname else config['dbname']\n self.driver = 'com.mysql.jdbc.Driver'\n self.jdbc = 'jdbc:mysql://'\n else:\n self.host = config['host']\n self.port = config['port']\n self.user = config['user']\n self.password = config['password']\n self.dbname = dbname if dbname else config['dbname']\n self.driver = 'org.postgresql.Driver'\n self.jdbc = 'jdbc:postgresql://'\n\nclass Database():\n def __init__(self, args, timeout=-1):\n self.args = args\n self.conn = self.resetConn(timeout)\n\n\n # self.schema = self.compute_table_schema()\n\n def resetConn(self, timeout=-1):\n conn = psycopg2.connect(database=self.args.dbname,\n user=self.args.user,\n password=self.args.password,\n host=self.args.host,\n port=self.args.port)\n return conn\n \n def execute_sqls(self,sql):\n self.conn =self.resetConn(timeout=-1)\n cur = self.conn.cursor()\n cur.execute(sql)\n self.conn.commit()\n cur.close()\n self.conn.close()\n\n def execute_sql_duration(self, duration, sql, max_id=0, commit_interval=500):\n self.conn = self.resetConn(timeout=-1)\n cursor = self.conn.cursor()\n start = time.time()\n cnt = 0\n if duration > 0:\n while (time.time() - start) < duration:\n if max_id > 0:\n id = random.randint(1, max_id - 1)\n cursor.execute(sql + str(id) + ';')\n else:\n cursor.execute(sql)\n cnt += 1\n if cnt % commit_interval == 0:\n self.conn.commit()\n else:\n print(\"error, the duration should be larger than 0\")\n self.conn.commit()\n cursor.close()\n self.conn.close()\n return cnt\n\n def concurrent_execute_sql(self, threads, duration, sql, max_id=0, commit_interval=500):\n pool = ThreadPool(threads)\n results = [pool.apply_async(self.execute_sql_duration, (duration, sql, max_id, commit_interval)) for _ in range(threads)]\n pool.close()\n pool.join()\n return results\n\ndef init():\n #add the config\n config_path = \"/root/DB-GPT/config/tool_config.yaml\"\n with open(config_path, 'r') as config_file:\n config = yaml.safe_load(config_file) \n db_args =DBArgs('pgsql', config)\n return db_args\n\n\n#create a table\ndef create_table(table_name,colsize, ncolumns):\n db=Database(init())\n column_definitions = ', '.join(f'name{i} varchar({colsize})' for i in range(ncolumns))\n creat_sql = f'CREATE TABLE {table_name} (id int, {column_definitions}, time timestamp);'\n db.execute_sqls(creat_sql)\n\n#delete the table\ndef delete_table(table_name):\n db=Database(init())\n delete_sql=f'DROP TABLE if exists {table_name}'\n db.execute_sqls(delete_sql)\n\n#print the current time\ndef print_time():\n current_time = datetime.datetime.now()\n formatted_time = current_time.strftime(\"%Y-%m-%d %H:%M:%S\")\n print(formatted_time)\n\n\ndef 
missing_index(threads,duration,ncolumns,nrows,colsize,table_name='table1'):\n #create a new table\n print_time()\n db=Database(init())\n delete_table(table_name)\n create_table(table_name,colsize, ncolumns)\n\n # insert some data to be selected \n insert_definitions = ', '.join(f'(SELECT substr(md5(random()::text), 1, {colsize}))' for i in range(ncolumns))\n insert_data=f'insert into {table_name} select generate_series(1,{nrows}),{insert_definitions}, now();'\n db.execute_sqls(insert_data) \n\n #select without the index\n missing_index='select * from '+table_name+' where id='\n db.concurrent_execute_sql(threads,duration,missing_index,nrows)\n\n #delete the table\n delete_table(table_name)\n #print the end time\n print_time()\n\nif __name__ == \"__main__\":\n # Number of threads to use for concurrent inserts\n num_threads = 5\n \n # Duration for which to run the inserts (in seconds)\n insert_duration = None\n \n # Number of columns in the table\n num_columns = 10\n \n # Number of rows to insert\n num_rows = 4000000\n \n # Size of each column (in characters)\n column_size = 100\n \n # Table name\n table_name = 'table1'\n \n # Call the insert_large_data function\n missing_index(num_threads, insert_duration, num_columns, num_rows, column_size, table_name)\n", + "description": "In a database used by a social media platform, 5 users simultaneously perform searches on a database table containing 10 columns, 4,000,000 rows, each column with a size of 100 characters. However, the search operation lacks the necessary indexes, resulting in an exception in the database.\n", + "desc": "In a banking scenario, there is a database named 'BankingDB', which is responsible for storing and managing various customer and transaction data. This database contains a key table called 'TransactionDetails', which records detailed information about various banking transactions. The table consists of 4,000,000 rows of data, with each row representing a transaction record. The table has a total of 10 columns, each containing information of up to 100 characters. These columns may include transaction ID, account number, transaction type (such as deposit, withdrawal, transfer), transaction amount, transaction date, counterparty account information, transaction status, employee ID (of the bank staff who processed the transaction), transaction location, currency type, and more. However, due to the lack of necessary indexes in the 'TransactionDetails' table, particularly on commonly used columns such as transaction type, account number, and transaction date, the database becomes inefficient in handling a large number of concurrent transaction queries. This inefficiency eventually leads to anomalies in the database, as it cannot efficiently process these numerous concurrent transaction requests. 
This not only results in a decline in user experience (e.g., transaction delays or failures) but may also affect the operational efficiency and reputation of the banking system.\n" + }, + "533": { + "start_time": "1697451284", + "end_time": "1697451356", + "start_timestamp": "2023-10-16 18:14:44", + "end_timestamp": "2023-10-16 18:15:56", + "alerts": [], + "labels": [ + "missing indexes" + ], + "command": "python anomaly_trigger/main.py --anomaly MISSING_INDEXES", + "script": "import psycopg2\nimport sys\nsys.path.append('/root/DB-GPT/')\nimport time\nimport datetime\nimport random\nimport yaml\nfrom multiprocessing.pool import *\n\n\nclass DBArgs(object):\n\n def __init__(self, dbtype, config, dbname=None):\n self.dbtype = dbtype\n if self.dbtype == 'mysql':\n self.host = config['host']\n self.port = config['port']\n self.user = config['user']\n self.password = config['password']\n self.dbname = dbname if dbname else config['dbname']\n self.driver = 'com.mysql.jdbc.Driver'\n self.jdbc = 'jdbc:mysql://'\n else:\n self.host = config['host']\n self.port = config['port']\n self.user = config['user']\n self.password = config['password']\n self.dbname = dbname if dbname else config['dbname']\n self.driver = 'org.postgresql.Driver'\n self.jdbc = 'jdbc:postgresql://'\n\nclass Database():\n def __init__(self, args, timeout=-1):\n self.args = args\n self.conn = self.resetConn(timeout)\n\n\n # self.schema = self.compute_table_schema()\n\n def resetConn(self, timeout=-1):\n conn = psycopg2.connect(database=self.args.dbname,\n user=self.args.user,\n password=self.args.password,\n host=self.args.host,\n port=self.args.port)\n return conn\n \n def execute_sqls(self,sql):\n self.conn =self.resetConn(timeout=-1)\n cur = self.conn.cursor()\n cur.execute(sql)\n self.conn.commit()\n cur.close()\n self.conn.close()\n\n def execute_sql_duration(self, duration, sql, max_id=0, commit_interval=500):\n self.conn = self.resetConn(timeout=-1)\n cursor = self.conn.cursor()\n start = time.time()\n cnt = 0\n if duration > 0:\n while (time.time() - start) < duration:\n if max_id > 0:\n id = random.randint(1, max_id - 1)\n cursor.execute(sql + str(id) + ';')\n else:\n cursor.execute(sql)\n cnt += 1\n if cnt % commit_interval == 0:\n self.conn.commit()\n else:\n print(\"error, the duration should be larger than 0\")\n self.conn.commit()\n cursor.close()\n self.conn.close()\n return cnt\n\n def concurrent_execute_sql(self, threads, duration, sql, max_id=0, commit_interval=500):\n pool = ThreadPool(threads)\n results = [pool.apply_async(self.execute_sql_duration, (duration, sql, max_id, commit_interval)) for _ in range(threads)]\n pool.close()\n pool.join()\n return results\n\ndef init():\n #add the config\n config_path = \"/root/DB-GPT/config/tool_config.yaml\"\n with open(config_path, 'r') as config_file:\n config = yaml.safe_load(config_file) \n db_args =DBArgs('pgsql', config)\n return db_args\n\n\n#create a table\ndef create_table(table_name,colsize, ncolumns):\n db=Database(init())\n column_definitions = ', '.join(f'name{i} varchar({colsize})' for i in range(ncolumns))\n creat_sql = f'CREATE TABLE {table_name} (id int, {column_definitions}, time timestamp);'\n db.execute_sqls(creat_sql)\n\n#delete the table\ndef delete_table(table_name):\n db=Database(init())\n delete_sql=f'DROP TABLE if exists {table_name}'\n db.execute_sqls(delete_sql)\n\n#print the current time\ndef print_time():\n current_time = datetime.datetime.now()\n formatted_time = current_time.strftime(\"%Y-%m-%d %H:%M:%S\")\n print(formatted_time)\n\n\ndef 
missing_index(threads,duration,ncolumns,nrows,colsize,table_name='table1'):\n #create a new table\n print_time()\n db=Database(init())\n delete_table(table_name)\n create_table(table_name,colsize, ncolumns)\n\n # insert some data to be selected \n insert_definitions = ', '.join(f'(SELECT substr(md5(random()::text), 1, {colsize}))' for i in range(ncolumns))\n insert_data=f'insert into {table_name} select generate_series(1,{nrows}),{insert_definitions}, now();'\n db.execute_sqls(insert_data) \n\n #select without the index\n missing_index='select * from '+table_name+' where id='\n db.concurrent_execute_sql(threads,duration,missing_index,nrows)\n\n #delete the table\n delete_table(table_name)\n #print the end time\n print_time()\n\nif __name__ == \"__main__\":\n # Number of threads to use for concurrent inserts\n num_threads = 5\n \n # Duration for which to run the inserts (in seconds)\n insert_duration = None\n \n # Number of columns in the table\n num_columns = 10\n \n # Number of rows to insert\n num_rows = 4000000\n \n # Size of each column (in characters)\n column_size = 100\n \n # Table name\n table_name = 'table1'\n \n # Call the insert_large_data function\n missing_index(num_threads, insert_duration, num_columns, num_rows, column_size, table_name)\n", + "description": "In a database used by an e-commerce platform, 5 users are searching for information in a table with 10 columns and 4,000,000 rows, where each column can store up to 100 characters. However, the search queries are not optimized with proper indexes, resulting in poor query performance and potential database exceptions.\n", + "desc": "In the banking scenario, there is a database called 'BankDB' that stores customer account information and transaction records. One crucial table in this database is called 'TransactionHistory', which records detailed information about various banking transactions. This table contains a total of 4,000,000 rows of data, with each row representing a transaction record. There are 10 columns in this table, each capable of storing up to 100 characters. These columns may include transaction ID, account number, transaction type (such as deposit, withdrawal, transfer), transaction amount, transaction date, beneficiary account information, transaction status, and more.However, due to the lack of appropriate indexes, specifically on commonly used columns such as account number, transaction type, and transaction date, the database faces inefficient handling of concurrent queries. For example, when multiple users try to search for transactions using these columns, the database struggles to provide fast and accurate results. This lack of indexes causes performance issues, resulting in slow query execution and delays in accessing the required transaction records. 
As a result, the customer experience is negatively impacted, and the efficiency and productivity of the banking operations as a whole are compromised.\n" + }, + "534": { + "start_time": "1697451416", + "end_time": "1697451490", + "start_timestamp": "2023-10-16 18:16:56", + "end_timestamp": "2023-10-16 18:18:10", + "alerts": [], + "labels": [ + "missing indexes" + ], + "command": "python anomaly_trigger/main.py --anomaly MISSING_INDEXES", + "script": "import psycopg2\nimport sys\nsys.path.append('/root/DB-GPT/')\nimport time\nimport datetime\nimport random\nimport yaml\nfrom multiprocessing.pool import *\n\n\nclass DBArgs(object):\n\n def __init__(self, dbtype, config, dbname=None):\n self.dbtype = dbtype\n if self.dbtype == 'mysql':\n self.host = config['host']\n self.port = config['port']\n self.user = config['user']\n self.password = config['password']\n self.dbname = dbname if dbname else config['dbname']\n self.driver = 'com.mysql.jdbc.Driver'\n self.jdbc = 'jdbc:mysql://'\n else:\n self.host = config['host']\n self.port = config['port']\n self.user = config['user']\n self.password = config['password']\n self.dbname = dbname if dbname else config['dbname']\n self.driver = 'org.postgresql.Driver'\n self.jdbc = 'jdbc:postgresql://'\n\nclass Database():\n def __init__(self, args, timeout=-1):\n self.args = args\n self.conn = self.resetConn(timeout)\n\n\n # self.schema = self.compute_table_schema()\n\n def resetConn(self, timeout=-1):\n conn = psycopg2.connect(database=self.args.dbname,\n user=self.args.user,\n password=self.args.password,\n host=self.args.host,\n port=self.args.port)\n return conn\n \n def execute_sqls(self,sql):\n self.conn =self.resetConn(timeout=-1)\n cur = self.conn.cursor()\n cur.execute(sql)\n self.conn.commit()\n cur.close()\n self.conn.close()\n\n def execute_sql_duration(self, duration, sql, max_id=0, commit_interval=500):\n self.conn = self.resetConn(timeout=-1)\n cursor = self.conn.cursor()\n start = time.time()\n cnt = 0\n if duration > 0:\n while (time.time() - start) < duration:\n if max_id > 0:\n id = random.randint(1, max_id - 1)\n cursor.execute(sql + str(id) + ';')\n else:\n cursor.execute(sql)\n cnt += 1\n if cnt % commit_interval == 0:\n self.conn.commit()\n else:\n print(\"error, the duration should be larger than 0\")\n self.conn.commit()\n cursor.close()\n self.conn.close()\n return cnt\n\n def concurrent_execute_sql(self, threads, duration, sql, max_id=0, commit_interval=500):\n pool = ThreadPool(threads)\n results = [pool.apply_async(self.execute_sql_duration, (duration, sql, max_id, commit_interval)) for _ in range(threads)]\n pool.close()\n pool.join()\n return results\n\ndef init():\n #add the config\n config_path = \"/root/DB-GPT/config/tool_config.yaml\"\n with open(config_path, 'r') as config_file:\n config = yaml.safe_load(config_file) \n db_args =DBArgs('pgsql', config)\n return db_args\n\n\n#create a table\ndef create_table(table_name,colsize, ncolumns):\n db=Database(init())\n column_definitions = ', '.join(f'name{i} varchar({colsize})' for i in range(ncolumns))\n creat_sql = f'CREATE TABLE {table_name} (id int, {column_definitions}, time timestamp);'\n db.execute_sqls(creat_sql)\n\n#delete the table\ndef delete_table(table_name):\n db=Database(init())\n delete_sql=f'DROP TABLE if exists {table_name}'\n db.execute_sqls(delete_sql)\n\n#print the current time\ndef print_time():\n current_time = datetime.datetime.now()\n formatted_time = current_time.strftime(\"%Y-%m-%d %H:%M:%S\")\n print(formatted_time)\n\n\ndef 
missing_index(threads,duration,ncolumns,nrows,colsize,table_name='table1'):\n #create a new table\n print_time()\n db=Database(init())\n delete_table(table_name)\n create_table(table_name,colsize, ncolumns)\n\n # insert some data to be selected \n insert_definitions = ', '.join(f'(SELECT substr(md5(random()::text), 1, {colsize}))' for i in range(ncolumns))\n insert_data=f'insert into {table_name} select generate_series(1,{nrows}),{insert_definitions}, now();'\n db.execute_sqls(insert_data) \n\n #select without the index\n missing_index='select * from '+table_name+' where id='\n db.concurrent_execute_sql(threads,duration,missing_index,nrows)\n\n #delete the table\n delete_table(table_name)\n #print the end time\n print_time()\n\nif __name__ == \"__main__\":\n # Number of threads to use for concurrent inserts\n num_threads = 10\n \n # Duration for which to run the inserts (in seconds)\n insert_duration = None\n \n # Number of columns in the table\n num_columns = 10\n \n # Number of rows to insert\n num_rows = 4000000\n \n # Size of each column (in characters)\n column_size = 100\n \n # Table name\n table_name = 'table1'\n \n # Call the insert_large_data function\n missing_index(num_threads, insert_duration, num_columns, num_rows, column_size, table_name)\n", + "description": "In the database of an online platform, 10 users perform searches on a table containing 10 columns and 4,000,000 rows, with each column having a size of 100 characters. However, the search operation lacks the necessary index, leading to a database exception.\n", + "desc": "In an e-commerce platform database, there is a database called 'ECommerceDB' that stores information about various types of products. One of the key tables in this database is 'ProductDetails', which contains detailed information about different products. This table has 4,000,000 rows of data, each representing a product, and has a total of 10 columns. These columns include product ID, name, price, stock quantity, brand, category, size, color, weight, and user rating. Each column can contain up to 100 characters of data.However, due to the lack of necessary indexes on commonly used search columns such as name, brand, and category, the database faces efficiency issues when handling a large number of concurrent search requests. This lack of indexes affects the performance of the search functionality and can result in delays or failures in processing search queries. This inefficiency in handling concurrent search requests can lead to anomalies in the database and negatively impact user experience, operational efficiency, and the reputation of the e-commerce platform. To address this issue, the script 'python anomaly_trigger/main.py' with the anomaly parameter 'MISSING_INDEXES' is used to simulate and trigger anomalies related to missing indexes. 
The script is configured to use 10 threads for concurrent execution and specifies the number of columns, column size, and number of rows in the 'ProductDetails' table.\n" + }, + "535": { + "start_time": "1697451550", + "end_time": "1697451623", + "start_timestamp": "2023-10-16 18:19:10", + "end_timestamp": "2023-10-16 18:20:23", + "alerts": [], + "labels": [ + "missing indexes" + ], + "command": "python anomaly_trigger/main.py --anomaly MISSING_INDEXES", + "script": "import psycopg2\nimport sys\nsys.path.append('/root/DB-GPT/')\nimport time\nimport datetime\nimport random\nimport yaml\nfrom multiprocessing.pool import *\n\n\nclass DBArgs(object):\n\n def __init__(self, dbtype, config, dbname=None):\n self.dbtype = dbtype\n if self.dbtype == 'mysql':\n self.host = config['host']\n self.port = config['port']\n self.user = config['user']\n self.password = config['password']\n self.dbname = dbname if dbname else config['dbname']\n self.driver = 'com.mysql.jdbc.Driver'\n self.jdbc = 'jdbc:mysql://'\n else:\n self.host = config['host']\n self.port = config['port']\n self.user = config['user']\n self.password = config['password']\n self.dbname = dbname if dbname else config['dbname']\n self.driver = 'org.postgresql.Driver'\n self.jdbc = 'jdbc:postgresql://'\n\nclass Database():\n def __init__(self, args, timeout=-1):\n self.args = args\n self.conn = self.resetConn(timeout)\n\n\n # self.schema = self.compute_table_schema()\n\n def resetConn(self, timeout=-1):\n conn = psycopg2.connect(database=self.args.dbname,\n user=self.args.user,\n password=self.args.password,\n host=self.args.host,\n port=self.args.port)\n return conn\n \n def execute_sqls(self,sql):\n self.conn =self.resetConn(timeout=-1)\n cur = self.conn.cursor()\n cur.execute(sql)\n self.conn.commit()\n cur.close()\n self.conn.close()\n\n def execute_sql_duration(self, duration, sql, max_id=0, commit_interval=500):\n self.conn = self.resetConn(timeout=-1)\n cursor = self.conn.cursor()\n start = time.time()\n cnt = 0\n if duration > 0:\n while (time.time() - start) < duration:\n if max_id > 0:\n id = random.randint(1, max_id - 1)\n cursor.execute(sql + str(id) + ';')\n else:\n cursor.execute(sql)\n cnt += 1\n if cnt % commit_interval == 0:\n self.conn.commit()\n else:\n print(\"error, the duration should be larger than 0\")\n self.conn.commit()\n cursor.close()\n self.conn.close()\n return cnt\n\n def concurrent_execute_sql(self, threads, duration, sql, max_id=0, commit_interval=500):\n pool = ThreadPool(threads)\n results = [pool.apply_async(self.execute_sql_duration, (duration, sql, max_id, commit_interval)) for _ in range(threads)]\n pool.close()\n pool.join()\n return results\n\ndef init():\n #add the config\n config_path = \"/root/DB-GPT/config/tool_config.yaml\"\n with open(config_path, 'r') as config_file:\n config = yaml.safe_load(config_file) \n db_args =DBArgs('pgsql', config)\n return db_args\n\n\n#create a table\ndef create_table(table_name,colsize, ncolumns):\n db=Database(init())\n column_definitions = ', '.join(f'name{i} varchar({colsize})' for i in range(ncolumns))\n creat_sql = f'CREATE TABLE {table_name} (id int, {column_definitions}, time timestamp);'\n db.execute_sqls(creat_sql)\n\n#delete the table\ndef delete_table(table_name):\n db=Database(init())\n delete_sql=f'DROP TABLE if exists {table_name}'\n db.execute_sqls(delete_sql)\n\n#print the current time\ndef print_time():\n current_time = datetime.datetime.now()\n formatted_time = current_time.strftime(\"%Y-%m-%d %H:%M:%S\")\n print(formatted_time)\n\n\ndef 
missing_index(threads,duration,ncolumns,nrows,colsize,table_name='table1'):\n #create a new table\n print_time()\n db=Database(init())\n delete_table(table_name)\n create_table(table_name,colsize, ncolumns)\n\n # insert some data to be selected \n insert_definitions = ', '.join(f'(SELECT substr(md5(random()::text), 1, {colsize}))' for i in range(ncolumns))\n insert_data=f'insert into {table_name} select generate_series(1,{nrows}),{insert_definitions}, now();'\n db.execute_sqls(insert_data) \n\n #select without the index\n missing_index='select * from '+table_name+' where id='\n db.concurrent_execute_sql(threads,duration,missing_index,nrows)\n\n #delete the table\n delete_table(table_name)\n #print the end time\n print_time()\n\nif __name__ == \"__main__\":\n # Number of threads to use for concurrent inserts\n num_threads = 10\n \n # Duration for which to run the inserts (in seconds)\n insert_duration = None\n \n # Number of columns in the table\n num_columns = 10\n \n # Number of rows to insert\n num_rows = 4000000\n \n # Size of each column (in characters)\n column_size = 100\n \n # Table name\n table_name = 'table1'\n \n # Call the insert_large_data function\n missing_index(num_threads, insert_duration, num_columns, num_rows, column_size, table_name)\n", + "description": "In the database of a content management system, 10 users are searching in a database table containing 10 columns and 4,000,000 rows. Each column has a size of 100 characters. However, the search operation does not have the necessary index, resulting in a database exception.\n", + "desc": "In an e-commerce platform's database, there is a database named 'OnlineStoreDB' that stores information about various products. One important table in this database is called 'ProductDetails', which contains a large number of rows representing different products. Each row in this table consists of 10 columns, including information such as product ID, name, price, stock quantity, brand, category, size, color, weight, and description. However, the database lacks indexes on commonly used search columns such as name, brand, and category. As a result, when multiple users search for products simultaneously, the database becomes inefficient in handling these concurrent search requests. 
This inefficiency can lead to anomalies in the system, causing search delays or failures and potentially affecting the overall operational efficiency and reputation of the e-commerce platform.\n" + }, + "536": { + "start_time": "1697451708", + "end_time": "1697451778", + "start_timestamp": "2023-10-16 18:21:48", + "end_timestamp": "2023-10-16 18:22:58", + "alerts": [], + "labels": [ + "missing indexes" + ], + "command": "python anomaly_trigger/main.py --anomaly MISSING_INDEXES", + "script": "import psycopg2\nimport sys\nsys.path.append('/root/DB-GPT/')\nimport time\nimport datetime\nimport random\nimport yaml\nfrom multiprocessing.pool import *\n\n\nclass DBArgs(object):\n\n def __init__(self, dbtype, config, dbname=None):\n self.dbtype = dbtype\n if self.dbtype == 'mysql':\n self.host = config['host']\n self.port = config['port']\n self.user = config['user']\n self.password = config['password']\n self.dbname = dbname if dbname else config['dbname']\n self.driver = 'com.mysql.jdbc.Driver'\n self.jdbc = 'jdbc:mysql://'\n else:\n self.host = config['host']\n self.port = config['port']\n self.user = config['user']\n self.password = config['password']\n self.dbname = dbname if dbname else config['dbname']\n self.driver = 'org.postgresql.Driver'\n self.jdbc = 'jdbc:postgresql://'\n\nclass Database():\n def __init__(self, args, timeout=-1):\n self.args = args\n self.conn = self.resetConn(timeout)\n\n\n # self.schema = self.compute_table_schema()\n\n def resetConn(self, timeout=-1):\n conn = psycopg2.connect(database=self.args.dbname,\n user=self.args.user,\n password=self.args.password,\n host=self.args.host,\n port=self.args.port)\n return conn\n \n def execute_sqls(self,sql):\n self.conn =self.resetConn(timeout=-1)\n cur = self.conn.cursor()\n cur.execute(sql)\n self.conn.commit()\n cur.close()\n self.conn.close()\n\n def execute_sql_duration(self, duration, sql, max_id=0, commit_interval=500):\n self.conn = self.resetConn(timeout=-1)\n cursor = self.conn.cursor()\n start = time.time()\n cnt = 0\n if duration > 0:\n while (time.time() - start) < duration:\n if max_id > 0:\n id = random.randint(1, max_id - 1)\n cursor.execute(sql + str(id) + ';')\n else:\n cursor.execute(sql)\n cnt += 1\n if cnt % commit_interval == 0:\n self.conn.commit()\n else:\n print(\"error, the duration should be larger than 0\")\n self.conn.commit()\n cursor.close()\n self.conn.close()\n return cnt\n\n def concurrent_execute_sql(self, threads, duration, sql, max_id=0, commit_interval=500):\n pool = ThreadPool(threads)\n results = [pool.apply_async(self.execute_sql_duration, (duration, sql, max_id, commit_interval)) for _ in range(threads)]\n pool.close()\n pool.join()\n return results\n\ndef init():\n #add the config\n config_path = \"/root/DB-GPT/config/tool_config.yaml\"\n with open(config_path, 'r') as config_file:\n config = yaml.safe_load(config_file) \n db_args =DBArgs('pgsql', config)\n return db_args\n\n\n#create a table\ndef create_table(table_name,colsize, ncolumns):\n db=Database(init())\n column_definitions = ', '.join(f'name{i} varchar({colsize})' for i in range(ncolumns))\n creat_sql = f'CREATE TABLE {table_name} (id int, {column_definitions}, time timestamp);'\n db.execute_sqls(creat_sql)\n\n#delete the table\ndef delete_table(table_name):\n db=Database(init())\n delete_sql=f'DROP TABLE if exists {table_name}'\n db.execute_sqls(delete_sql)\n\n#print the current time\ndef print_time():\n current_time = datetime.datetime.now()\n formatted_time = current_time.strftime(\"%Y-%m-%d %H:%M:%S\")\n 
print(formatted_time)\n\n\ndef missing_index(threads,duration,ncolumns,nrows,colsize,table_name='table1'):\n #create a new table\n print_time()\n db=Database(init())\n delete_table(table_name)\n create_table(table_name,colsize, ncolumns)\n\n # insert some data to be selected \n insert_definitions = ', '.join(f'(SELECT substr(md5(random()::text), 1, {colsize}))' for i in range(ncolumns))\n insert_data=f'insert into {table_name} select generate_series(1,{nrows}),{insert_definitions}, now();'\n db.execute_sqls(insert_data) \n\n #select without the index\n missing_index='select * from '+table_name+' where id='\n db.concurrent_execute_sql(threads,duration,missing_index,nrows)\n\n #delete the table\n delete_table(table_name)\n #print the end time\n print_time()\n\nif __name__ == \"__main__\":\n # Number of threads to use for concurrent inserts\n num_threads = 5\n \n # Duration for which to run the inserts (in seconds)\n insert_duration = None\n \n # Number of columns in the table\n num_columns = 20\n \n # Number of rows to insert\n num_rows = 2000000\n \n # Size of each column (in characters)\n column_size = 50\n \n # Table name\n table_name = 'table1'\n \n # Call the insert_large_data function\n missing_index(num_threads, insert_duration, num_columns, num_rows, column_size, table_name)\n", + "description": "In the database of an online shopping platform, if there are 5 users searching in the database table containing 20 columns, 2,000,000 rows, each column size of 50 characters, but the search lacks the necessary index, resulting in an exception in the database.\n", + "desc": "In a file sharing scenario, suppose there is a database called 'FileShareDB', which is used by teams or organizations to share files. This database stores not only the files themselves but also their metadata, such as uploader information, file size, creation and modification dates, version history, access permissions, and download counts. During normal usage, multiple users may be simultaneously uploading, downloading, or editing files. For example, a group of colleagues may collaborate on a project, with team members frequently uploading the latest versions of files, while others download them for viewing or editing. Additionally, the system may be used for storing and sharing large files, such as presentations, video conference recordings, or design drawings. However, due to the lack of necessary indexes in the 'FileShareDB' database, particularly on frequently used search columns such as file name, uploader, and category, the database's ability to handle a large number of concurrent search requests is compromised. This inefficiency can result in a decline in user experience, such as slow search results or timeouts. 
Furthermore, it may affect the overall operational efficiency and reputation of the file sharing platform.\n" + }, + "537": { + "start_time": "1697451838", + "end_time": "1697451909", + "start_timestamp": "2023-10-16 18:23:58", + "end_timestamp": "2023-10-16 18:25:09", + "alerts": [], + "labels": [ + "missing indexes" + ], + "command": "python anomaly_trigger/main.py --anomaly MISSING_INDEXES", + "script": "import psycopg2\nimport sys\nsys.path.append('/root/DB-GPT/')\nimport time\nimport datetime\nimport random\nimport yaml\nfrom multiprocessing.pool import *\n\n\nclass DBArgs(object):\n\n def __init__(self, dbtype, config, dbname=None):\n self.dbtype = dbtype\n if self.dbtype == 'mysql':\n self.host = config['host']\n self.port = config['port']\n self.user = config['user']\n self.password = config['password']\n self.dbname = dbname if dbname else config['dbname']\n self.driver = 'com.mysql.jdbc.Driver'\n self.jdbc = 'jdbc:mysql://'\n else:\n self.host = config['host']\n self.port = config['port']\n self.user = config['user']\n self.password = config['password']\n self.dbname = dbname if dbname else config['dbname']\n self.driver = 'org.postgresql.Driver'\n self.jdbc = 'jdbc:postgresql://'\n\nclass Database():\n def __init__(self, args, timeout=-1):\n self.args = args\n self.conn = self.resetConn(timeout)\n\n\n # self.schema = self.compute_table_schema()\n\n def resetConn(self, timeout=-1):\n conn = psycopg2.connect(database=self.args.dbname,\n user=self.args.user,\n password=self.args.password,\n host=self.args.host,\n port=self.args.port)\n return conn\n \n def execute_sqls(self,sql):\n self.conn =self.resetConn(timeout=-1)\n cur = self.conn.cursor()\n cur.execute(sql)\n self.conn.commit()\n cur.close()\n self.conn.close()\n\n def execute_sql_duration(self, duration, sql, max_id=0, commit_interval=500):\n self.conn = self.resetConn(timeout=-1)\n cursor = self.conn.cursor()\n start = time.time()\n cnt = 0\n if duration > 0:\n while (time.time() - start) < duration:\n if max_id > 0:\n id = random.randint(1, max_id - 1)\n cursor.execute(sql + str(id) + ';')\n else:\n cursor.execute(sql)\n cnt += 1\n if cnt % commit_interval == 0:\n self.conn.commit()\n else:\n print(\"error, the duration should be larger than 0\")\n self.conn.commit()\n cursor.close()\n self.conn.close()\n return cnt\n\n def concurrent_execute_sql(self, threads, duration, sql, max_id=0, commit_interval=500):\n pool = ThreadPool(threads)\n results = [pool.apply_async(self.execute_sql_duration, (duration, sql, max_id, commit_interval)) for _ in range(threads)]\n pool.close()\n pool.join()\n return results\n\ndef init():\n #add the config\n config_path = \"/root/DB-GPT/config/tool_config.yaml\"\n with open(config_path, 'r') as config_file:\n config = yaml.safe_load(config_file) \n db_args =DBArgs('pgsql', config)\n return db_args\n\n\n#create a table\ndef create_table(table_name,colsize, ncolumns):\n db=Database(init())\n column_definitions = ', '.join(f'name{i} varchar({colsize})' for i in range(ncolumns))\n creat_sql = f'CREATE TABLE {table_name} (id int, {column_definitions}, time timestamp);'\n db.execute_sqls(creat_sql)\n\n#delete the table\ndef delete_table(table_name):\n db=Database(init())\n delete_sql=f'DROP TABLE if exists {table_name}'\n db.execute_sqls(delete_sql)\n\n#print the current time\ndef print_time():\n current_time = datetime.datetime.now()\n formatted_time = current_time.strftime(\"%Y-%m-%d %H:%M:%S\")\n print(formatted_time)\n\n\ndef 
missing_index(threads,duration,ncolumns,nrows,colsize,table_name='table1'):\n #create a new table\n print_time()\n db=Database(init())\n delete_table(table_name)\n create_table(table_name,colsize, ncolumns)\n\n # insert some data to be selected \n insert_definitions = ', '.join(f'(SELECT substr(md5(random()::text), 1, {colsize}))' for i in range(ncolumns))\n insert_data=f'insert into {table_name} select generate_series(1,{nrows}),{insert_definitions}, now();'\n db.execute_sqls(insert_data) \n\n #select without the index\n missing_index='select * from '+table_name+' where id='\n db.concurrent_execute_sql(threads,duration,missing_index,nrows)\n\n #delete the table\n delete_table(table_name)\n #print the end time\n print_time()\n\nif __name__ == \"__main__\":\n # Number of threads to use for concurrent inserts\n num_threads = 5\n \n # Duration for which to run the inserts (in seconds)\n insert_duration = None\n \n # Number of columns in the table\n num_columns = 20\n \n # Number of rows to insert\n num_rows = 2000000\n \n # Size of each column (in characters)\n column_size = 50\n \n # Table name\n table_name = 'table1'\n \n # Call the insert_large_data function\n missing_index(num_threads, insert_duration, num_columns, num_rows, column_size, table_name)\n", + "description": "In the case of a database for an online shopping platform, if 5 users simultaneously search for data in a database table with 20 columns, containing 2,000,000 rows, where each column has a size of 50 characters, and the necessary indexes are missing, it may lead to a database exception.\n", + "desc": "In the file sharing system scenario, suppose there is a database named 'FileShareSystemDB', which serves as a repository for storing and sharing files among users. This database contains a key table named 'FileDetails', which records information about each file, such as file name, file size, uploader ID, upload time, access permissions, and download counts. In this scenario, the database might face an issue related to missing indexes. When multiple users simultaneously perform file searches or filter files based on specific criteria, such as file size or upload time, the database's search capability might not be efficient enough. This inefficiency is primarily caused by missing indexes on commonly used search columns, such as file name, file size, or upload time. As a result, the database has to perform full table scans or scan large portions of data, leading to slower search responses and decreased overall system performance. This can undermine user experience, particularly when the number of concurrent file search requests is high. 
Additionally, it can also affect the system's operational efficiency, as these inefficiencies can slow down other database operations, including file uploads, metadata updates, or access control validations.\n" + }, + "538": { + "start_time": "1697451969", + "end_time": "1697452041", + "start_timestamp": "2023-10-16 18:26:09", + "end_timestamp": "2023-10-16 18:27:21", + "alerts": [], + "labels": [ + "missing indexes" + ], + "command": "python anomaly_trigger/main.py --anomaly MISSING_INDEXES", + "script": "import psycopg2\nimport sys\nsys.path.append('/root/DB-GPT/')\nimport time\nimport datetime\nimport random\nimport yaml\nfrom multiprocessing.pool import *\n\n\nclass DBArgs(object):\n\n def __init__(self, dbtype, config, dbname=None):\n self.dbtype = dbtype\n if self.dbtype == 'mysql':\n self.host = config['host']\n self.port = config['port']\n self.user = config['user']\n self.password = config['password']\n self.dbname = dbname if dbname else config['dbname']\n self.driver = 'com.mysql.jdbc.Driver'\n self.jdbc = 'jdbc:mysql://'\n else:\n self.host = config['host']\n self.port = config['port']\n self.user = config['user']\n self.password = config['password']\n self.dbname = dbname if dbname else config['dbname']\n self.driver = 'org.postgresql.Driver'\n self.jdbc = 'jdbc:postgresql://'\n\nclass Database():\n def __init__(self, args, timeout=-1):\n self.args = args\n self.conn = self.resetConn(timeout)\n\n\n # self.schema = self.compute_table_schema()\n\n def resetConn(self, timeout=-1):\n conn = psycopg2.connect(database=self.args.dbname,\n user=self.args.user,\n password=self.args.password,\n host=self.args.host,\n port=self.args.port)\n return conn\n \n def execute_sqls(self,sql):\n self.conn =self.resetConn(timeout=-1)\n cur = self.conn.cursor()\n cur.execute(sql)\n self.conn.commit()\n cur.close()\n self.conn.close()\n\n def execute_sql_duration(self, duration, sql, max_id=0, commit_interval=500):\n self.conn = self.resetConn(timeout=-1)\n cursor = self.conn.cursor()\n start = time.time()\n cnt = 0\n if duration > 0:\n while (time.time() - start) < duration:\n if max_id > 0:\n id = random.randint(1, max_id - 1)\n cursor.execute(sql + str(id) + ';')\n else:\n cursor.execute(sql)\n cnt += 1\n if cnt % commit_interval == 0:\n self.conn.commit()\n else:\n print(\"error, the duration should be larger than 0\")\n self.conn.commit()\n cursor.close()\n self.conn.close()\n return cnt\n\n def concurrent_execute_sql(self, threads, duration, sql, max_id=0, commit_interval=500):\n pool = ThreadPool(threads)\n results = [pool.apply_async(self.execute_sql_duration, (duration, sql, max_id, commit_interval)) for _ in range(threads)]\n pool.close()\n pool.join()\n return results\n\ndef init():\n #add the config\n config_path = \"/root/DB-GPT/config/tool_config.yaml\"\n with open(config_path, 'r') as config_file:\n config = yaml.safe_load(config_file) \n db_args =DBArgs('pgsql', config)\n return db_args\n\n\n#create a table\ndef create_table(table_name,colsize, ncolumns):\n db=Database(init())\n column_definitions = ', '.join(f'name{i} varchar({colsize})' for i in range(ncolumns))\n creat_sql = f'CREATE TABLE {table_name} (id int, {column_definitions}, time timestamp);'\n db.execute_sqls(creat_sql)\n\n#delete the table\ndef delete_table(table_name):\n db=Database(init())\n delete_sql=f'DROP TABLE if exists {table_name}'\n db.execute_sqls(delete_sql)\n\n#print the current time\ndef print_time():\n current_time = datetime.datetime.now()\n formatted_time = current_time.strftime(\"%Y-%m-%d %H:%M:%S\")\n 
print(formatted_time)\n\n\ndef missing_index(threads,duration,ncolumns,nrows,colsize,table_name='table1'):\n #create a new table\n print_time()\n db=Database(init())\n delete_table(table_name)\n create_table(table_name,colsize, ncolumns)\n\n # insert some data to be selected \n insert_definitions = ', '.join(f'(SELECT substr(md5(random()::text), 1, {colsize}))' for i in range(ncolumns))\n insert_data=f'insert into {table_name} select generate_series(1,{nrows}),{insert_definitions}, now();'\n db.execute_sqls(insert_data) \n\n #select without the index\n missing_index='select * from '+table_name+' where id='\n db.concurrent_execute_sql(threads,duration,missing_index,nrows)\n\n #delete the table\n delete_table(table_name)\n #print the end time\n print_time()\n\nif __name__ == \"__main__\":\n # Number of threads to use for concurrent inserts\n num_threads = 10\n \n # Duration for which to run the inserts (in seconds)\n insert_duration = None\n \n # Number of columns in the table\n num_columns = 20\n \n # Number of rows to insert\n num_rows = 2000000\n \n # Size of each column (in characters)\n column_size = 50\n \n # Table name\n table_name = 'table1'\n \n # Call the insert_large_data function\n missing_index(num_threads, insert_duration, num_columns, num_rows, column_size, table_name)\n\n\n\n", + "description": "In a database used by an online shopping platform, if there are 10 users simultaneously searching in a database table containing 20 columns, 2 million rows, each column with a size of 50 characters, and the search operation lacks the necessary index, it can lead to a database exception.\n", + "desc": "In a business intelligence scenario, particularly in the context of analyzing financial data, there exists a database named 'FinancialDataDB'. This database contains a key table named 'FinancialRecords', which holds various financial transactions and statement information for a large organization. The table is composed of 2,000,000 rows, each representing a financial record, with a total of 20 columns, each capable of storing up to 50 characters. These columns may include transaction ID, transaction type (such as income, expenditure, assets, liabilities), amount, date, department, project code, budget code, financial year, audit status, and more.When conducting complex financial analysis queries, such as calculating quarterly income or analyzing departmental budgets, the efficiency of these queries may be compromised due to the absence of appropriate database indexes. Specifically, the 'FinancialRecords' table lacks indexes on commonly queried columns, like transaction type, department, or financial year. As a result, the database struggles to process these queries efficiently, leading to delays, increased response times, and even potential anomalies in the system.To resolve these issues and improve query performance, it is recommended to create appropriate indexes on the relevant columns within the 'FinancialRecords' table. 
By doing so, the database will be able to quickly and efficiently retrieve the desired data, leading to improved analytical capabilities and a more seamless financial reporting process.\n" + } +} diff --git a/marble/environments/db_env_docker/tpch-queries/1.explain.sql b/marble/environments/db_env_docker/tpch-queries/1.explain.sql new file mode 100644 index 00000000..8d02dbdd --- /dev/null +++ b/marble/environments/db_env_docker/tpch-queries/1.explain.sql @@ -0,0 +1,22 @@ +select + l_returnflag, + l_linestatus, + sum(l_quantity) as sum_qty, + sum(l_extendedprice) as sum_base_price, + sum(l_extendedprice * (1 - l_discount)) as sum_disc_price, + sum(l_extendedprice * (1 - l_discount) * (1 + l_tax)) as sum_charge, + avg(l_quantity) as avg_qty, + avg(l_extendedprice) as avg_price, + avg(l_discount) as avg_disc, + count(*) as count_order +from + lineitem +where + l_shipdate <= date '1998-12-01' - interval '84' day +group by + l_returnflag, + l_linestatus +order by + l_returnflag, + l_linestatus +LIMIT 1; diff --git a/marble/environments/db_env_docker/tpch-queries/1.sql b/marble/environments/db_env_docker/tpch-queries/1.sql new file mode 100644 index 00000000..d0309599 --- /dev/null +++ b/marble/environments/db_env_docker/tpch-queries/1.sql @@ -0,0 +1,28 @@ +-- $ID$ +-- TPC-H/TPC-R Pricing Summary Report Query (Q1) +-- Functional Query Definition +-- Approved February 1998 +:x +:o +select + l_returnflag, + l_linestatus, + sum(l_quantity) as sum_qty, + sum(l_extendedprice) as sum_base_price, + sum(l_extendedprice * (1 - l_discount)) as sum_disc_price, + sum(l_extendedprice * (1 - l_discount) * (1 + l_tax)) as sum_charge, + avg(l_quantity) as avg_qty, + avg(l_extendedprice) as avg_price, + avg(l_discount) as avg_disc, + count(*) as count_order +from + lineitem +where + l_shipdate <= date '1998-12-01' - interval ':1' day (3) +group by + l_returnflag, + l_linestatus +order by + l_returnflag, + l_linestatus; +:n -1 diff --git a/marble/environments/db_env_docker/tpch-queries/10.explain.sql b/marble/environments/db_env_docker/tpch-queries/10.explain.sql new file mode 100644 index 00000000..a14253a6 --- /dev/null +++ b/marble/environments/db_env_docker/tpch-queries/10.explain.sql @@ -0,0 +1,69 @@ +-- using 1585648051 as a seed to the RNG + + +explain select + c_custkey, + c_name, + sum(l_extendedprice * (1 - l_discount)) as revenue, + c_acctbal, + n_name, + c_address, + c_phone, + c_comment +from + customer, + orders, + lineitem, + nation +where + c_custkey = o_custkey + and l_orderkey = o_orderkey + and o_orderdate >= date '1994-07-01' + and o_orderdate < date '1994-07-01' + interval '3' month + and l_returnflag = 'R' + and c_nationkey = n_nationkey +group by + c_custkey, + c_name, + c_acctbal, + c_phone, + n_name, + c_address, + c_comment +order by + revenue desc; +-- LIMIT 20;-- using 1585648051 as a seed to the RNG + + +select + c_custkey, + c_name, + sum(l_extendedprice * (1 - l_discount)) as revenue, + c_acctbal, + n_name, + c_address, + c_phone, + c_comment +from + customer, + orders, + lineitem, + nation +where + c_custkey = o_custkey + and l_orderkey = o_orderkey + and o_orderdate >= date '1994-07-01' + and o_orderdate < date '1994-07-01' + interval '3' month + and l_returnflag = 'R' + and c_nationkey = n_nationkey +group by + c_custkey, + c_name, + c_acctbal, + c_phone, + n_name, + c_address, + c_comment +order by + revenue desc +LIMIT 20; diff --git a/marble/environments/db_env_docker/tpch-queries/10.sql b/marble/environments/db_env_docker/tpch-queries/10.sql new file mode 100644 index 
00000000..aa52f8ea --- /dev/null +++ b/marble/environments/db_env_docker/tpch-queries/10.sql @@ -0,0 +1,38 @@ +-- $ID$ +-- TPC-H/TPC-R Returned Item Reporting Query (Q10) +-- Functional Query Definition +-- Approved February 1998 +:x +:o +select + c_custkey, + c_name, + sum(l_extendedprice * (1 - l_discount)) as revenue, + c_acctbal, + n_name, + c_address, + c_phone, + c_comment +from + customer, + orders, + lineitem, + nation +where + c_custkey = o_custkey + and l_orderkey = o_orderkey + and o_orderdate >= date ':1' + and o_orderdate < date ':1' + interval '3' month + and l_returnflag = 'R' + and c_nationkey = n_nationkey +group by + c_custkey, + c_name, + c_acctbal, + c_phone, + n_name, + c_address, + c_comment +order by + revenue desc; +:n 20 diff --git a/marble/environments/db_env_docker/tpch-queries/11.explain.sql b/marble/environments/db_env_docker/tpch-queries/11.explain.sql new file mode 100644 index 00000000..72dc0249 --- /dev/null +++ b/marble/environments/db_env_docker/tpch-queries/11.explain.sql @@ -0,0 +1,61 @@ +-- using 1585648051 as a seed to the RNG + + +explain select + ps_partkey, + sum(ps_supplycost * ps_availqty) as value +from + partsupp, + supplier, + nation +where + ps_suppkey = s_suppkey + and s_nationkey = n_nationkey + and n_name = 'ALGERIA' +group by + ps_partkey having + sum(ps_supplycost * ps_availqty) > ( + select + sum(ps_supplycost * ps_availqty) * 0.0001000000 + from + partsupp, + supplier, + nation + where + ps_suppkey = s_suppkey + and s_nationkey = n_nationkey + and n_name = 'ALGERIA' + ) +order by + value desc +LIMIT 1;-- using 1585648051 as a seed to the RNG + + +select + ps_partkey, + sum(ps_supplycost * ps_availqty) as value +from + partsupp, + supplier, + nation +where + ps_suppkey = s_suppkey + and s_nationkey = n_nationkey + and n_name = 'ALGERIA' +group by + ps_partkey having + sum(ps_supplycost * ps_availqty) > ( + select + sum(ps_supplycost * ps_availqty) * 0.0001000000 + from + partsupp, + supplier, + nation + where + ps_suppkey = s_suppkey + and s_nationkey = n_nationkey + and n_name = 'ALGERIA' + ) +order by + value desc +LIMIT 1; \ No newline at end of file diff --git a/marble/environments/db_env_docker/tpch-queries/11.sql b/marble/environments/db_env_docker/tpch-queries/11.sql new file mode 100644 index 00000000..8d8b7445 --- /dev/null +++ b/marble/environments/db_env_docker/tpch-queries/11.sql @@ -0,0 +1,34 @@ +-- $ID$ +-- TPC-H/TPC-R Important Stock Identification Query (Q11) +-- Functional Query Definition +-- Approved February 1998 +:x +:o +select + ps_partkey, + sum(ps_supplycost * ps_availqty) as value +from + partsupp, + supplier, + nation +where + ps_suppkey = s_suppkey + and s_nationkey = n_nationkey + and n_name = ':1' +group by + ps_partkey having + sum(ps_supplycost * ps_availqty) > ( + select + sum(ps_supplycost * ps_availqty) * :2 + from + partsupp, + supplier, + nation + where + ps_suppkey = s_suppkey + and s_nationkey = n_nationkey + and n_name = ':1' + ) +order by + value desc; +:n -1 diff --git a/marble/environments/db_env_docker/tpch-queries/12.explain.sql b/marble/environments/db_env_docker/tpch-queries/12.explain.sql new file mode 100644 index 00000000..98731500 --- /dev/null +++ b/marble/environments/db_env_docker/tpch-queries/12.explain.sql @@ -0,0 +1,63 @@ +-- using 1585648051 as a seed to the RNG + + +explain select + l_shipmode, + sum(case + when o_orderpriority = '1-URGENT' + or o_orderpriority = '2-HIGH' + then 1 + else 0 + end) as high_line_count, + sum(case + when o_orderpriority <> '1-URGENT' + and 
o_orderpriority <> '2-HIGH' + then 1 + else 0 + end) as low_line_count +from + orders, + lineitem +where + o_orderkey = l_orderkey + and l_shipmode in ('TRUCK', 'MAIL') + and l_commitdate < l_receiptdate + and l_shipdate < l_commitdate + and l_receiptdate >= date '1995-01-01' + and l_receiptdate < date '1995-01-01' + interval '1' year +group by + l_shipmode +order by + l_shipmode +LIMIT 1;-- using 1585648051 as a seed to the RNG + + +select + l_shipmode, + sum(case + when o_orderpriority = '1-URGENT' + or o_orderpriority = '2-HIGH' + then 1 + else 0 + end) as high_line_count, + sum(case + when o_orderpriority <> '1-URGENT' + and o_orderpriority <> '2-HIGH' + then 1 + else 0 + end) as low_line_count +from + orders, + lineitem +where + o_orderkey = l_orderkey + and l_shipmode in ('TRUCK', 'MAIL') + and l_commitdate < l_receiptdate + and l_shipdate < l_commitdate + and l_receiptdate >= date '1995-01-01' + and l_receiptdate < date '1995-01-01' + interval '1' year +group by + l_shipmode +order by + l_shipmode +LIMIT 1; \ No newline at end of file diff --git a/marble/environments/db_env_docker/tpch-queries/12.sql b/marble/environments/db_env_docker/tpch-queries/12.sql new file mode 100644 index 00000000..2cc7a0f8 --- /dev/null +++ b/marble/environments/db_env_docker/tpch-queries/12.sql @@ -0,0 +1,35 @@ +-- $ID$ +-- TPC-H/TPC-R Shipping Modes and Order Priority Query (Q12) +-- Functional Query Definition +-- Approved February 1998 +:x +:o +select + l_shipmode, + sum(case + when o_orderpriority = '1-URGENT' + or o_orderpriority = '2-HIGH' + then 1 + else 0 + end) as high_line_count, + sum(case + when o_orderpriority <> '1-URGENT' + and o_orderpriority <> '2-HIGH' + then 1 + else 0 + end) as low_line_count +from + orders, + lineitem +where + o_orderkey = l_orderkey + and l_shipmode in (':1', ':2') + and l_commitdate < l_receiptdate + and l_shipdate < l_commitdate + and l_receiptdate >= date ':3' + and l_receiptdate < date ':3' + interval '1' year +group by + l_shipmode +order by + l_shipmode; +:n -1 diff --git a/marble/environments/db_env_docker/tpch-queries/13.explain.sql b/marble/environments/db_env_docker/tpch-queries/13.explain.sql new file mode 100644 index 00000000..b0b59343 --- /dev/null +++ b/marble/environments/db_env_docker/tpch-queries/13.explain.sql @@ -0,0 +1,47 @@ +-- using 1585648051 as a seed to the RNG + + +explain select + c_count, + count(*) as custdist +from + ( + select + c_custkey, + count(o_orderkey) + from + customer left outer join orders on + c_custkey = o_custkey + and o_comment not like '%unusual%requests%' + group by + c_custkey + ) as c_orders (c_custkey, c_count) +group by + c_count +order by + custdist desc, + c_count desc +LIMIT 1;-- using 1585648051 as a seed to the RNG + + +select + c_count, + count(*) as custdist +from + ( + select + c_custkey, + count(o_orderkey) + from + customer left outer join orders on + c_custkey = o_custkey + and o_comment not like '%unusual%requests%' + group by + c_custkey + ) as c_orders (c_custkey, c_count) +group by + c_count +order by + custdist desc, + c_count desc +LIMIT 1; \ No newline at end of file diff --git a/marble/environments/db_env_docker/tpch-queries/13.sql b/marble/environments/db_env_docker/tpch-queries/13.sql new file mode 100644 index 00000000..20dd693e --- /dev/null +++ b/marble/environments/db_env_docker/tpch-queries/13.sql @@ -0,0 +1,27 @@ +-- $ID$ +-- TPC-H/TPC-R Customer Distribution Query (Q13) +-- Functional Query Definition +-- Approved February 1998 +:x +:o +select + c_count, + count(*) as custdist +from + ( 
+ select + c_custkey, + count(o_orderkey) + from + customer left outer join orders on + c_custkey = o_custkey + and o_comment not like '%:1%:2%' + group by + c_custkey + ) as c_orders (c_custkey, c_count) +group by + c_count +order by + custdist desc, + c_count desc; +:n -1 diff --git a/marble/environments/db_env_docker/tpch-queries/14.explain.sql b/marble/environments/db_env_docker/tpch-queries/14.explain.sql new file mode 100644 index 00000000..79c6933a --- /dev/null +++ b/marble/environments/db_env_docker/tpch-queries/14.explain.sql @@ -0,0 +1,33 @@ +-- using 1585648051 as a seed to the RNG + + +explain select + 100.00 * sum(case + when p_type like 'PROMO%' + then l_extendedprice * (1 - l_discount) + else 0 + end) / sum(l_extendedprice * (1 - l_discount)) as promo_revenue +from + lineitem, + part +where + l_partkey = p_partkey + and l_shipdate >= date '1995-01-01' + and l_shipdate < date '1995-01-01' + interval '1' month +LIMIT 1;-- using 1585648051 as a seed to the RNG + + +select + 100.00 * sum(case + when p_type like 'PROMO%' + then l_extendedprice * (1 - l_discount) + else 0 + end) / sum(l_extendedprice * (1 - l_discount)) as promo_revenue +from + lineitem, + part +where + l_partkey = p_partkey + and l_shipdate >= date '1995-01-01' + and l_shipdate < date '1995-01-01' + interval '1' month +LIMIT 1; \ No newline at end of file diff --git a/marble/environments/db_env_docker/tpch-queries/14.sql b/marble/environments/db_env_docker/tpch-queries/14.sql new file mode 100644 index 00000000..27c5b479 --- /dev/null +++ b/marble/environments/db_env_docker/tpch-queries/14.sql @@ -0,0 +1,20 @@ +-- $ID$ +-- TPC-H/TPC-R Promotion Effect Query (Q14) +-- Functional Query Definition +-- Approved February 1998 +:x +:o +select + 100.00 * sum(case + when p_type like 'PROMO%' + then l_extendedprice * (1 - l_discount) + else 0 + end) / sum(l_extendedprice * (1 - l_discount)) as promo_revenue +from + lineitem, + part +where + l_partkey = p_partkey + and l_shipdate >= date ':1' + and l_shipdate < date ':1' + interval '1' month; +:n -1 diff --git a/marble/environments/db_env_docker/tpch-queries/15.explain.sql b/marble/environments/db_env_docker/tpch-queries/15.explain.sql new file mode 100644 index 00000000..87609891 --- /dev/null +++ b/marble/environments/db_env_docker/tpch-queries/15.explain.sql @@ -0,0 +1,73 @@ +-- using 1585648051 as a seed to the RNG + +create view revenue0 (supplier_no, total_revenue) as + select + l_suppkey, + sum(l_extendedprice * (1 - l_discount)) + from + lineitem + where + l_shipdate >= date '1993-11-01' + and l_shipdate < date '1993-11-01' + interval '3' month + group by + l_suppkey; + + +explain select + s_suppkey, + s_name, + s_address, + s_phone, + total_revenue +from + supplier, + revenue0 +where + s_suppkey = supplier_no + and total_revenue = ( + select + max(total_revenue) + from + revenue0 + ) +order by + s_suppkey +LIMIT 1; + +drop view revenue0;-- using 1585648051 as a seed to the RNG + +create view revenue0 (supplier_no, total_revenue) as + select + l_suppkey, + sum(l_extendedprice * (1 - l_discount)) + from + lineitem + where + l_shipdate >= date '1993-11-01' + and l_shipdate < date '1993-11-01' + interval '3' month + group by + l_suppkey; + + +select + s_suppkey, + s_name, + s_address, + s_phone, + total_revenue +from + supplier, + revenue0 +where + s_suppkey = supplier_no + and total_revenue = ( + select + max(total_revenue) + from + revenue0 + ) +order by + s_suppkey +LIMIT 1; + +drop view revenue0; \ No newline at end of file diff --git 
a/marble/environments/db_env_docker/tpch-queries/15.sql b/marble/environments/db_env_docker/tpch-queries/15.sql new file mode 100644 index 00000000..77fe2a40 --- /dev/null +++ b/marble/environments/db_env_docker/tpch-queries/15.sql @@ -0,0 +1,40 @@ +-- $ID$ +-- TPC-H/TPC-R Top Supplier Query (Q15) +-- Functional Query Definition +-- Approved February 1998 +:x +create view revenue:s (supplier_no, total_revenue) as + select + l_suppkey, + sum(l_extendedprice * (1 - l_discount)) + from + lineitem + where + l_shipdate >= date ':1' + and l_shipdate < date ':1' + interval '3' month + group by + l_suppkey; + +:o +select + s_suppkey, + s_name, + s_address, + s_phone, + total_revenue +from + supplier, + revenue:s +where + s_suppkey = supplier_no + and total_revenue = ( + select + max(total_revenue) + from + revenue:s + ) +order by + s_suppkey; + +drop view revenue:s; +:n -1 diff --git a/marble/environments/db_env_docker/tpch-queries/16.explain.sql b/marble/environments/db_env_docker/tpch-queries/16.explain.sql new file mode 100644 index 00000000..1b3440f9 --- /dev/null +++ b/marble/environments/db_env_docker/tpch-queries/16.explain.sql @@ -0,0 +1,67 @@ +-- using 1585648051 as a seed to the RNG + + +explain select + p_brand, + p_type, + p_size, + count(distinct ps_suppkey) as supplier_cnt +from + partsupp, + part +where + p_partkey = ps_partkey + and p_brand <> 'Brand#45' + and p_type not like 'MEDIUM ANODIZED%' + and p_size in (33, 37, 43, 3, 25, 40, 15, 41) + and ps_suppkey not in ( + select + s_suppkey + from + supplier + where + s_comment like '%Customer%Complaints%' + ) +group by + p_brand, + p_type, + p_size +order by + supplier_cnt desc, + p_brand, + p_type, + p_size +LIMIT 1;-- using 1585648051 as a seed to the RNG + + +select + p_brand, + p_type, + p_size, + count(distinct ps_suppkey) as supplier_cnt +from + partsupp, + part +where + p_partkey = ps_partkey + and p_brand <> 'Brand#45' + and p_type not like 'MEDIUM ANODIZED%' + and p_size in (33, 37, 43, 3, 25, 40, 15, 41) + and ps_suppkey not in ( + select + s_suppkey + from + supplier + where + s_comment like '%Customer%Complaints%' + ) +group by + p_brand, + p_type, + p_size +order by + supplier_cnt desc, + p_brand, + p_type, + p_size +LIMIT 1; \ No newline at end of file diff --git a/marble/environments/db_env_docker/tpch-queries/16.sql b/marble/environments/db_env_docker/tpch-queries/16.sql new file mode 100644 index 00000000..f07a9657 --- /dev/null +++ b/marble/environments/db_env_docker/tpch-queries/16.sql @@ -0,0 +1,37 @@ +-- $ID$ +-- TPC-H/TPC-R Parts/Supplier Relationship Query (Q16) +-- Functional Query Definition +-- Approved February 1998 +:x +:o +select + p_brand, + p_type, + p_size, + count(distinct ps_suppkey) as supplier_cnt +from + partsupp, + part +where + p_partkey = ps_partkey + and p_brand <> ':1' + and p_type not like ':2%' + and p_size in (:3, :4, :5, :6, :7, :8, :9, :10) + and ps_suppkey not in ( + select + s_suppkey + from + supplier + where + s_comment like '%Customer%Complaints%' + ) +group by + p_brand, + p_type, + p_size +order by + supplier_cnt desc, + p_brand, + p_type, + p_size; +:n -1 diff --git a/marble/environments/db_env_docker/tpch-queries/17.explain.sql b/marble/environments/db_env_docker/tpch-queries/17.explain.sql new file mode 100644 index 00000000..e1c022cb --- /dev/null +++ b/marble/environments/db_env_docker/tpch-queries/17.explain.sql @@ -0,0 +1,32 @@ +-- using 1585648051 as a seed to the RNG + + +explain select + sum(l_extendedprice) / 7.0 as avg_yearly +from + lineitem, + part, + (SELECT 
l_partkey AS agg_partkey, 0.2 * avg(l_quantity) AS avg_quantity FROM lineitem GROUP BY l_partkey) part_agg +where + p_partkey = l_partkey + and agg_partkey = l_partkey + and p_brand = 'Brand#52' + and p_container = 'JUMBO BOX' + and l_quantity < avg_quantity +LIMIT 1; +-- using 1585648051 as a seed to the RNG + + +select + sum(l_extendedprice) / 7.0 as avg_yearly +from + lineitem, + part, + (SELECT l_partkey AS agg_partkey, 0.2 * avg(l_quantity) AS avg_quantity FROM lineitem GROUP BY l_partkey) part_agg +where + p_partkey = l_partkey + and agg_partkey = l_partkey + and p_brand = 'Brand#52' + and p_container = 'JUMBO BOX' + and l_quantity < avg_quantity +LIMIT 1; diff --git a/marble/environments/db_env_docker/tpch-queries/17.sql b/marble/environments/db_env_docker/tpch-queries/17.sql new file mode 100644 index 00000000..bca3f1bb --- /dev/null +++ b/marble/environments/db_env_docker/tpch-queries/17.sql @@ -0,0 +1,24 @@ +-- $ID$ +-- TPC-H/TPC-R Small-Quantity-Order Revenue Query (Q17) +-- Functional Query Definition +-- Approved February 1998 +:x +:o +select + sum(l_extendedprice) / 7.0 as avg_yearly +from + lineitem, + part +where + p_partkey = l_partkey + and p_brand = ':1' + and p_container = ':2' + and l_quantity < ( + select + 0.2 * avg(l_quantity) + from + lineitem + where + l_partkey = p_partkey + ); +:n -1 diff --git a/marble/environments/db_env_docker/tpch-queries/18.explain.sql b/marble/environments/db_env_docker/tpch-queries/18.explain.sql new file mode 100644 index 00000000..d7c7e7c5 --- /dev/null +++ b/marble/environments/db_env_docker/tpch-queries/18.explain.sql @@ -0,0 +1,71 @@ +-- using 1585648051 as a seed to the RNG + + +explain select + c_name, + c_custkey, + o_orderkey, + o_orderdate, + o_totalprice, + sum(l_quantity) +from + customer, + orders, + lineitem +where + o_orderkey in ( + select + l_orderkey + from + lineitem + group by + l_orderkey having + sum(l_quantity) > 313 + ) + and c_custkey = o_custkey + and o_orderkey = l_orderkey +group by + c_name, + c_custkey, + o_orderkey, + o_orderdate, + o_totalprice +order by + o_totalprice desc, + o_orderdate +LIMIT 100;-- using 1585648051 as a seed to the RNG + + +select + c_name, + c_custkey, + o_orderkey, + o_orderdate, + o_totalprice, + sum(l_quantity) +from + customer, + orders, + lineitem +where + o_orderkey in ( + select + l_orderkey + from + lineitem + group by + l_orderkey having + sum(l_quantity) > 313 + ) + and c_custkey = o_custkey + and o_orderkey = l_orderkey +group by + c_name, + c_custkey, + o_orderkey, + o_orderdate, + o_totalprice +order by + o_totalprice desc, + o_orderdate +LIMIT 100; \ No newline at end of file diff --git a/marble/environments/db_env_docker/tpch-queries/18.sql b/marble/environments/db_env_docker/tpch-queries/18.sql new file mode 100644 index 00000000..3f7e1256 --- /dev/null +++ b/marble/environments/db_env_docker/tpch-queries/18.sql @@ -0,0 +1,39 @@ +-- $ID$ +-- TPC-H/TPC-R Large Volume Customer Query (Q18) +-- Function Query Definition +-- Approved February 1998 +:x +:o +select + c_name, + c_custkey, + o_orderkey, + o_orderdate, + o_totalprice, + sum(l_quantity) +from + customer, + orders, + lineitem +where + o_orderkey in ( + select + l_orderkey + from + lineitem + group by + l_orderkey having + sum(l_quantity) > :1 + ) + and c_custkey = o_custkey + and o_orderkey = l_orderkey +group by + c_name, + c_custkey, + o_orderkey, + o_orderdate, + o_totalprice +order by + o_totalprice desc, + o_orderdate; +:n 100 diff --git a/marble/environments/db_env_docker/tpch-queries/19.explain.sql 
b/marble/environments/db_env_docker/tpch-queries/19.explain.sql new file mode 100644 index 00000000..9a65fb96 --- /dev/null +++ b/marble/environments/db_env_docker/tpch-queries/19.explain.sql @@ -0,0 +1,77 @@ +-- using 1585648051 as a seed to the RNG + + +explain select + sum(l_extendedprice* (1 - l_discount)) as revenue +from + lineitem, + part +where + ( + p_partkey = l_partkey + and p_brand = 'Brand#44' + and p_container in ('SM CASE', 'SM BOX', 'SM PACK', 'SM PKG') + and l_quantity >= 2 and l_quantity <= 2 + 10 + and p_size between 1 and 5 + and l_shipmode in ('AIR', 'AIR REG') + and l_shipinstruct = 'DELIVER IN PERSON' + ) + or + ( + p_partkey = l_partkey + and p_brand = 'Brand#55' + and p_container in ('MED BAG', 'MED BOX', 'MED PKG', 'MED PACK') + and l_quantity >= 18 and l_quantity <= 18 + 10 + and p_size between 1 and 10 + and l_shipmode in ('AIR', 'AIR REG') + and l_shipinstruct = 'DELIVER IN PERSON' + ) + or + ( + p_partkey = l_partkey + and p_brand = 'Brand#34' + and p_container in ('LG CASE', 'LG BOX', 'LG PACK', 'LG PKG') + and l_quantity >= 20 and l_quantity <= 20 + 10 + and p_size between 1 and 15 + and l_shipmode in ('AIR', 'AIR REG') + and l_shipinstruct = 'DELIVER IN PERSON' + ) +LIMIT 1;-- using 1585648051 as a seed to the RNG + + +select + sum(l_extendedprice* (1 - l_discount)) as revenue +from + lineitem, + part +where + ( + p_partkey = l_partkey + and p_brand = 'Brand#44' + and p_container in ('SM CASE', 'SM BOX', 'SM PACK', 'SM PKG') + and l_quantity >= 2 and l_quantity <= 2 + 10 + and p_size between 1 and 5 + and l_shipmode in ('AIR', 'AIR REG') + and l_shipinstruct = 'DELIVER IN PERSON' + ) + or + ( + p_partkey = l_partkey + and p_brand = 'Brand#55' + and p_container in ('MED BAG', 'MED BOX', 'MED PKG', 'MED PACK') + and l_quantity >= 18 and l_quantity <= 18 + 10 + and p_size between 1 and 10 + and l_shipmode in ('AIR', 'AIR REG') + and l_shipinstruct = 'DELIVER IN PERSON' + ) + or + ( + p_partkey = l_partkey + and p_brand = 'Brand#34' + and p_container in ('LG CASE', 'LG BOX', 'LG PACK', 'LG PKG') + and l_quantity >= 20 and l_quantity <= 20 + 10 + and p_size between 1 and 15 + and l_shipmode in ('AIR', 'AIR REG') + and l_shipinstruct = 'DELIVER IN PERSON' + ) +LIMIT 1; \ No newline at end of file diff --git a/marble/environments/db_env_docker/tpch-queries/19.sql b/marble/environments/db_env_docker/tpch-queries/19.sql new file mode 100644 index 00000000..a9c6e5d5 --- /dev/null +++ b/marble/environments/db_env_docker/tpch-queries/19.sql @@ -0,0 +1,42 @@ +-- $ID$ +-- TPC-H/TPC-R Discounted Revenue Query (Q19) +-- Functional Query Definition +-- Approved February 1998 +:x +:o +select + sum(l_extendedprice* (1 - l_discount)) as revenue +from + lineitem, + part +where + ( + p_partkey = l_partkey + and p_brand = ':1' + and p_container in ('SM CASE', 'SM BOX', 'SM PACK', 'SM PKG') + and l_quantity >= :4 and l_quantity <= :4 + 10 + and p_size between 1 and 5 + and l_shipmode in ('AIR', 'AIR REG') + and l_shipinstruct = 'DELIVER IN PERSON' + ) + or + ( + p_partkey = l_partkey + and p_brand = ':2' + and p_container in ('MED BAG', 'MED BOX', 'MED PKG', 'MED PACK') + and l_quantity >= :5 and l_quantity <= :5 + 10 + and p_size between 1 and 10 + and l_shipmode in ('AIR', 'AIR REG') + and l_shipinstruct = 'DELIVER IN PERSON' + ) + or + ( + p_partkey = l_partkey + and p_brand = ':3' + and p_container in ('LG CASE', 'LG BOX', 'LG PACK', 'LG PKG') + and l_quantity >= :6 and l_quantity <= :6 + 10 + and p_size between 1 and 15 + and l_shipmode in ('AIR', 'AIR REG') + and 
l_shipinstruct = 'DELIVER IN PERSON' + ); +:n -1 diff --git a/marble/environments/db_env_docker/tpch-queries/2.explain.sql b/marble/environments/db_env_docker/tpch-queries/2.explain.sql new file mode 100644 index 00000000..ac0701f0 --- /dev/null +++ b/marble/environments/db_env_docker/tpch-queries/2.explain.sql @@ -0,0 +1,94 @@ +-- using 1585648051 as a seed to the RNG + + +explain select + s_acctbal, + s_name, + n_name, + p_partkey, + p_mfgr, + s_address, + s_phone, + s_comment +from + part, + supplier, + partsupp, + nation, + region +where + p_partkey = ps_partkey + and s_suppkey = ps_suppkey + and p_size = 24 + and p_type like '%NICKEL' + and s_nationkey = n_nationkey + and n_regionkey = r_regionkey + and r_name = 'EUROPE' + and ps_supplycost = ( + select + min(ps_supplycost) + from + partsupp, + supplier, + nation, + region + where + p_partkey = ps_partkey + and s_suppkey = ps_suppkey + and s_nationkey = n_nationkey + and n_regionkey = r_regionkey + and r_name = 'EUROPE' + ) +order by + s_acctbal desc, + n_name, + s_name, + p_partkey +LIMIT 100; +-- using 1585648051 as a seed to the RNG + + +select + s_acctbal, + s_name, + n_name, + p_partkey, + p_mfgr, + s_address, + s_phone, + s_comment +from + part, + supplier, + partsupp, + nation, + region +where + p_partkey = ps_partkey + and s_suppkey = ps_suppkey + and p_size = 24 + and p_type like '%NICKEL' + and s_nationkey = n_nationkey + and n_regionkey = r_regionkey + and r_name = 'EUROPE' + and ps_supplycost = ( + select + min(ps_supplycost) + from + partsupp, + supplier, + nation, + region + where + p_partkey = ps_partkey + and s_suppkey = ps_suppkey + and s_nationkey = n_nationkey + and n_regionkey = r_regionkey + and r_name = 'EUROPE' + ) +order by + s_acctbal desc, + n_name, + s_name, + p_partkey +LIMIT 100; diff --git a/marble/environments/db_env_docker/tpch-queries/2.sql b/marble/environments/db_env_docker/tpch-queries/2.sql new file mode 100644 index 00000000..2c941f5a --- /dev/null +++ b/marble/environments/db_env_docker/tpch-queries/2.sql @@ -0,0 +1,50 @@ +-- $ID$ +-- TPC-H/TPC-R Minimum Cost Supplier Query (Q2) +-- Functional Query Definition +-- Approved February 1998 +:x +:o +select + s_acctbal, + s_name, + n_name, + p_partkey, + p_mfgr, + s_address, + s_phone, + s_comment +from + part, + supplier, + partsupp, + nation, + region +where + p_partkey = ps_partkey + and s_suppkey = ps_suppkey + and p_size = :1 + and p_type like '%:2' + and s_nationkey = n_nationkey + and n_regionkey = r_regionkey + and r_name = ':3' + and ps_supplycost = ( + select + min(ps_supplycost) + from + partsupp, + supplier, + nation, + region + where + p_partkey = ps_partkey + and s_suppkey = ps_suppkey + and s_nationkey = n_nationkey + and n_regionkey = r_regionkey + and r_name = ':3' + ) +order by + s_acctbal desc, + n_name, + s_name, + p_partkey; +:n 100 diff --git a/marble/environments/db_env_docker/tpch-queries/20.explain.sql b/marble/environments/db_env_docker/tpch-queries/20.explain.sql new file mode 100644 index 00000000..a919e527 --- /dev/null +++ b/marble/environments/db_env_docker/tpch-queries/20.explain.sql @@ -0,0 +1,94 @@ +-- using 1585648051 as a seed to the RNG + + +explain select + s_name, + s_address +from + supplier, + nation +where + s_suppkey in ( + select + ps_suppkey + from + partsupp, + ( + select + l_partkey agg_partkey, + l_suppkey agg_suppkey, + 0.5 * sum(l_quantity) AS agg_quantity + from + lineitem + where + l_shipdate >= date '1997-01-01' + and l_shipdate < date '1997-01-01' + interval '1' year + group by + l_partkey, + 
l_suppkey + ) agg_lineitem + where + agg_partkey = ps_partkey + and agg_suppkey = ps_suppkey + and ps_partkey in ( + select + p_partkey + from + part + where + p_name like 'powder%' + ) + and ps_availqty > agg_quantity + ) + and s_nationkey = n_nationkey + and n_name = 'SAUDI ARABIA' +order by + s_name +LIMIT 1; +-- using 1585648051 as a seed to the RNG + + +select + s_name, + s_address +from + supplier, + nation +where + s_suppkey in ( + select + ps_suppkey + from + partsupp, + ( + select + l_partkey agg_partkey, + l_suppkey agg_suppkey, + 0.5 * sum(l_quantity) AS agg_quantity + from + lineitem + where + l_shipdate >= date '1997-01-01' + and l_shipdate < date '1997-01-01' + interval '1' year + group by + l_partkey, + l_suppkey + ) agg_lineitem + where + agg_partkey = ps_partkey + and agg_suppkey = ps_suppkey + and ps_partkey in ( + select + p_partkey + from + part + where + p_name like 'powder%' + ) + and ps_availqty > agg_quantity + ) + and s_nationkey = n_nationkey + and n_name = 'SAUDI ARABIA' +order by + s_name +LIMIT 1; diff --git a/marble/environments/db_env_docker/tpch-queries/20.sql b/marble/environments/db_env_docker/tpch-queries/20.sql new file mode 100644 index 00000000..23cecc76 --- /dev/null +++ b/marble/environments/db_env_docker/tpch-queries/20.sql @@ -0,0 +1,44 @@ +-- $ID$ +-- TPC-H/TPC-R Potential Part Promotion Query (Q20) +-- Function Query Definition +-- Approved February 1998 +:x +:o +select + s_name, + s_address +from + supplier, + nation +where + s_suppkey in ( + select + ps_suppkey + from + partsupp + where + ps_partkey in ( + select + p_partkey + from + part + where + p_name like ':1%' + ) + and ps_availqty > ( + select + 0.5 * sum(l_quantity) + from + lineitem + where + l_partkey = ps_partkey + and l_suppkey = ps_suppkey + and l_shipdate >= date ':2' + and l_shipdate < date ':2' + interval '1' year + ) + ) + and s_nationkey = n_nationkey + and n_name = ':3' +order by + s_name; +:n -1 diff --git a/marble/environments/db_env_docker/tpch-queries/21.explain.sql b/marble/environments/db_env_docker/tpch-queries/21.explain.sql new file mode 100644 index 00000000..3d09d42e --- /dev/null +++ b/marble/environments/db_env_docker/tpch-queries/21.explain.sql @@ -0,0 +1,85 @@ +-- using 1585648051 as a seed to the RNG + + +explain select + s_name, + count(*) as numwait +from + supplier, + lineitem l1, + orders, + nation +where + s_suppkey = l1.l_suppkey + and o_orderkey = l1.l_orderkey + and o_orderstatus = 'F' + and l1.l_receiptdate > l1.l_commitdate + and exists ( + select + * + from + lineitem l2 + where + l2.l_orderkey = l1.l_orderkey + and l2.l_suppkey <> l1.l_suppkey + ) + and not exists ( + select + * + from + lineitem l3 + where + l3.l_orderkey = l1.l_orderkey + and l3.l_suppkey <> l1.l_suppkey + and l3.l_receiptdate > l3.l_commitdate + ) + and s_nationkey = n_nationkey + and n_name = 'SAUDI ARABIA' +group by + s_name +order by + numwait desc, + s_name +LIMIT 100;-- using 1585648051 as a seed to the RNG + + +select + s_name, + count(*) as numwait +from + supplier, + lineitem l1, + orders, + nation +where + s_suppkey = l1.l_suppkey + and o_orderkey = l1.l_orderkey + and o_orderstatus = 'F' + and l1.l_receiptdate > l1.l_commitdate + and exists ( + select + * + from + lineitem l2 + where + l2.l_orderkey = l1.l_orderkey + and l2.l_suppkey <> l1.l_suppkey + ) + and not exists ( + select + * + from + lineitem l3 + where + l3.l_orderkey = l1.l_orderkey + and l3.l_suppkey <> l1.l_suppkey + and l3.l_receiptdate > l3.l_commitdate + ) + and s_nationkey = n_nationkey + and n_name = 
'SAUDI ARABIA' +group by + s_name +order by + numwait desc, + s_name +LIMIT 100; \ No newline at end of file diff --git a/marble/environments/db_env_docker/tpch-queries/21.sql b/marble/environments/db_env_docker/tpch-queries/21.sql new file mode 100644 index 00000000..380bcfd8 --- /dev/null +++ b/marble/environments/db_env_docker/tpch-queries/21.sql @@ -0,0 +1,46 @@ +-- $ID$ +-- TPC-H/TPC-R Suppliers Who Kept Orders Waiting Query (Q21) +-- Functional Query Definition +-- Approved February 1998 +:x +:o +select + s_name, + count(*) as numwait +from + supplier, + lineitem l1, + orders, + nation +where + s_suppkey = l1.l_suppkey + and o_orderkey = l1.l_orderkey + and o_orderstatus = 'F' + and l1.l_receiptdate > l1.l_commitdate + and exists ( + select + * + from + lineitem l2 + where + l2.l_orderkey = l1.l_orderkey + and l2.l_suppkey <> l1.l_suppkey + ) + and not exists ( + select + * + from + lineitem l3 + where + l3.l_orderkey = l1.l_orderkey + and l3.l_suppkey <> l1.l_suppkey + and l3.l_receiptdate > l3.l_commitdate + ) + and s_nationkey = n_nationkey + and n_name = ':1' +group by + s_name +order by + numwait desc, + s_name; +:n 100 diff --git a/marble/environments/db_env_docker/tpch-queries/22.explain.sql b/marble/environments/db_env_docker/tpch-queries/22.explain.sql new file mode 100644 index 00000000..32e98392 --- /dev/null +++ b/marble/environments/db_env_docker/tpch-queries/22.explain.sql @@ -0,0 +1,81 @@ +-- using 1585648051 as a seed to the RNG + + +explain select + cntrycode, + count(*) as numcust, + sum(c_acctbal) as totacctbal +from + ( + select + substring(c_phone from 1 for 2) as cntrycode, + c_acctbal + from + customer + where + substring(c_phone from 1 for 2) in + ('40', '31', '39', '27', '20', '26', '33') + and c_acctbal > ( + select + avg(c_acctbal) + from + customer + where + c_acctbal > 0.00 + and substring(c_phone from 1 for 2) in + ('40', '31', '39', '27', '20', '26', '33') + ) + and not exists ( + select + * + from + orders + where + o_custkey = c_custkey + ) + ) as custsale +group by + cntrycode +order by + cntrycode +LIMIT 1;-- using 1585648051 as a seed to the RNG + + +select + cntrycode, + count(*) as numcust, + sum(c_acctbal) as totacctbal +from + ( + select + substring(c_phone from 1 for 2) as cntrycode, + c_acctbal + from + customer + where + substring(c_phone from 1 for 2) in + ('40', '31', '39', '27', '20', '26', '33') + and c_acctbal > ( + select + avg(c_acctbal) + from + customer + where + c_acctbal > 0.00 + and substring(c_phone from 1 for 2) in + ('40', '31', '39', '27', '20', '26', '33') + ) + and not exists ( + select + * + from + orders + where + o_custkey = c_custkey + ) + ) as custsale +group by + cntrycode +order by + cntrycode +LIMIT 1; \ No newline at end of file diff --git a/marble/environments/db_env_docker/tpch-queries/22.sql b/marble/environments/db_env_docker/tpch-queries/22.sql new file mode 100644 index 00000000..3cf5b615 --- /dev/null +++ b/marble/environments/db_env_docker/tpch-queries/22.sql @@ -0,0 +1,44 @@ +-- $ID$ +-- TPC-H/TPC-R Global Sales Opportunity Query (Q22) +-- Functional Query Definition +-- Approved February 1998 +:x +:o +select + cntrycode, + count(*) as numcust, + sum(c_acctbal) as totacctbal +from + ( + select + substring(c_phone from 1 for 2) as cntrycode, + c_acctbal + from + customer + where + substring(c_phone from 1 for 2) in + (':1', ':2', ':3', ':4', ':5', ':6', ':7') + and c_acctbal > ( + select + avg(c_acctbal) + from + customer + where + c_acctbal > 0.00 + and substring(c_phone from 1 for 2) in + (':1', ':2', 
':3', ':4', ':5', ':6', ':7') + ) + and not exists ( + select + * + from + orders + where + o_custkey = c_custkey + ) + ) as custsale +group by + cntrycode +order by + cntrycode; +:n -1 diff --git a/marble/environments/db_env_docker/tpch-queries/3.explain.sql b/marble/environments/db_env_docker/tpch-queries/3.explain.sql new file mode 100644 index 00000000..0caa3920 --- /dev/null +++ b/marble/environments/db_env_docker/tpch-queries/3.explain.sql @@ -0,0 +1,23 @@ +select + l_orderkey, + sum(l_extendedprice * (1 - l_discount)) as revenue, + o_orderdate, + o_shippriority +from + customer, + orders, + lineitem +where + c_mktsegment = 'HOUSEHOLD' + and c_custkey = o_custkey + and l_orderkey = o_orderkey + and o_orderdate < date '1995-03-20' + and l_shipdate > date '1995-03-20' +group by + l_orderkey, + o_orderdate, + o_shippriority +order by + revenue desc, + o_orderdate +LIMIT 10; diff --git a/marble/environments/db_env_docker/tpch-queries/3.sql b/marble/environments/db_env_docker/tpch-queries/3.sql new file mode 100644 index 00000000..7c388289 --- /dev/null +++ b/marble/environments/db_env_docker/tpch-queries/3.sql @@ -0,0 +1,29 @@ +-- $ID$ +-- TPC-H/TPC-R Shipping Priority Query (Q3) +-- Functional Query Definition +-- Approved February 1998 +:x +:o +select + l_orderkey, + sum(l_extendedprice * (1 - l_discount)) as revenue, + o_orderdate, + o_shippriority +from + customer, + orders, + lineitem +where + c_mktsegment = ':1' + and c_custkey = o_custkey + and l_orderkey = o_orderkey + and o_orderdate < date ':2' + and l_shipdate > date ':2' +group by + l_orderkey, + o_orderdate, + o_shippriority +order by + revenue desc, + o_orderdate; +:n 10 diff --git a/marble/environments/db_env_docker/tpch-queries/4.explain.sql b/marble/environments/db_env_docker/tpch-queries/4.explain.sql new file mode 100644 index 00000000..ff6d927c --- /dev/null +++ b/marble/environments/db_env_docker/tpch-queries/4.explain.sql @@ -0,0 +1,22 @@ +select + o_orderpriority, + count(*) as order_count +from + orders +where + o_orderdate >= date '1996-03-01' + and o_orderdate < date '1996-03-01' + interval '3' month + and exists ( + select + * + from + lineitem + where + l_orderkey = o_orderkey + and l_commitdate < l_receiptdate + ) +group by + o_orderpriority +order by + o_orderpriority +LIMIT 1; diff --git a/marble/environments/db_env_docker/tpch-queries/4.sql b/marble/environments/db_env_docker/tpch-queries/4.sql new file mode 100644 index 00000000..8a99c8d4 --- /dev/null +++ b/marble/environments/db_env_docker/tpch-queries/4.sql @@ -0,0 +1,28 @@ +-- $ID$ +-- TPC-H/TPC-R Order Priority Checking Query (Q4) +-- Functional Query Definition +-- Approved February 1998 +:x +:o +select + o_orderpriority, + count(*) as order_count +from + orders +where + o_orderdate >= date ':1' + and o_orderdate < date ':1' + interval '3' month + and exists ( + select + * + from + lineitem + where + l_orderkey = o_orderkey + and l_commitdate < l_receiptdate + ) +group by + o_orderpriority +order by + o_orderpriority; +:n -1 diff --git a/marble/environments/db_env_docker/tpch-queries/5.explain.sql b/marble/environments/db_env_docker/tpch-queries/5.explain.sql new file mode 100644 index 00000000..11e716a8 --- /dev/null +++ b/marble/environments/db_env_docker/tpch-queries/5.explain.sql @@ -0,0 +1,56 @@ +-- using 1585648051 as a seed to the RNG + + +explain select + n_name, + sum(l_extendedprice * (1 - l_discount)) as revenue +from + customer, + orders, + lineitem, + supplier, + nation, + region +where + c_custkey = o_custkey + and l_orderkey = 
o_orderkey + and l_suppkey = s_suppkey + and c_nationkey = s_nationkey + and s_nationkey = n_nationkey + and n_regionkey = r_regionkey + and r_name = 'AFRICA' + and o_orderdate >= date '1993-01-01' + and o_orderdate < date '1993-01-01' + interval '1' year +group by + n_name +order by + revenue desc +LIMIT 1; +-- using 1585648051 as a seed to the RNG + + +select + n_name, + sum(l_extendedprice * (1 - l_discount)) as revenue +from + customer, + orders, + lineitem, + supplier, + nation, + region +where + c_custkey = o_custkey + and l_orderkey = o_orderkey + and l_suppkey = s_suppkey + and c_nationkey = s_nationkey + and s_nationkey = n_nationkey + and n_regionkey = r_regionkey + and r_name = 'AFRICA' + and o_orderdate >= date '1993-01-01' + and o_orderdate < date '1993-01-01' + interval '1' year +group by + n_name +order by + revenue desc +LIMIT 1; diff --git a/marble/environments/db_env_docker/tpch-queries/5.sql b/marble/environments/db_env_docker/tpch-queries/5.sql new file mode 100644 index 00000000..499a735f --- /dev/null +++ b/marble/environments/db_env_docker/tpch-queries/5.sql @@ -0,0 +1,31 @@ +-- $ID$ +-- TPC-H/TPC-R Local Supplier Volume Query (Q5) +-- Functional Query Definition +-- Approved February 1998 +:x +:o +select + n_name, + sum(l_extendedprice * (1 - l_discount)) as revenue +from + customer, + orders, + lineitem, + supplier, + nation, + region +where + c_custkey = o_custkey + and l_orderkey = o_orderkey + and l_suppkey = s_suppkey + and c_nationkey = s_nationkey + and s_nationkey = n_nationkey + and n_regionkey = r_regionkey + and r_name = ':1' + and o_orderdate >= date ':2' + and o_orderdate < date ':2' + interval '1' year +group by + n_name +order by + revenue desc; +:n -1 diff --git a/marble/environments/db_env_docker/tpch-queries/6.explain.sql b/marble/environments/db_env_docker/tpch-queries/6.explain.sql new file mode 100644 index 00000000..6d3ca981 --- /dev/null +++ b/marble/environments/db_env_docker/tpch-queries/6.explain.sql @@ -0,0 +1,26 @@ +-- using 1585648051 as a seed to the RNG + + +explain select + sum(l_extendedprice * l_discount) as revenue +from + lineitem +where + l_shipdate >= date '1993-01-01' + and l_shipdate < date '1993-01-01' + interval '1' year + and l_discount between 0.03 - 0.01 and 0.03 + 0.01 + and l_quantity < 25 +LIMIT 1; +-- using 1585648051 as a seed to the RNG + + +select + sum(l_extendedprice * l_discount) as revenue +from + lineitem +where + l_shipdate >= date '1993-01-01' + and l_shipdate < date '1993-01-01' + interval '1' year + and l_discount between 0.03 - 0.01 and 0.03 + 0.01 + and l_quantity < 25 +LIMIT 1; diff --git a/marble/environments/db_env_docker/tpch-queries/6.sql b/marble/environments/db_env_docker/tpch-queries/6.sql new file mode 100644 index 00000000..8698a28a --- /dev/null +++ b/marble/environments/db_env_docker/tpch-queries/6.sql @@ -0,0 +1,16 @@ +-- $ID$ +-- TPC-H/TPC-R Forecasting Revenue Change Query (Q6) +-- Functional Query Definition +-- Approved February 1998 +:x +:o +select + sum(l_extendedprice * l_discount) as revenue +from + lineitem +where + l_shipdate >= date ':1' + and l_shipdate < date ':1' + interval '1' year + and l_discount between :2 - 0.01 and :2 + 0.01 + and l_quantity < :3; +:n -1 diff --git a/marble/environments/db_env_docker/tpch-queries/7.explain.sql b/marble/environments/db_env_docker/tpch-queries/7.explain.sql new file mode 100644 index 00000000..c99dccfc --- /dev/null +++ b/marble/environments/db_env_docker/tpch-queries/7.explain.sql @@ -0,0 +1,85 @@ +-- using 1585648051 as a seed to the RNG 
+ + +explain select + supp_nation, + cust_nation, + l_year, + sum(volume) as revenue +from + ( + select + n1.n_name as supp_nation, + n2.n_name as cust_nation, + extract(year from l_shipdate) as l_year, + l_extendedprice * (1 - l_discount) as volume + from + supplier, + lineitem, + orders, + customer, + nation n1, + nation n2 + where + s_suppkey = l_suppkey + and o_orderkey = l_orderkey + and c_custkey = o_custkey + and s_nationkey = n1.n_nationkey + and c_nationkey = n2.n_nationkey + and ( + (n1.n_name = 'EGYPT' and n2.n_name = 'UNITED STATES') + or (n1.n_name = 'UNITED STATES' and n2.n_name = 'EGYPT') + ) + and l_shipdate between date '1995-01-01' and date '1996-12-31' + ) as shipping +group by + supp_nation, + cust_nation, + l_year +order by + supp_nation, + cust_nation, + l_year +LIMIT 1;-- using 1585648051 as a seed to the RNG + + +select + supp_nation, + cust_nation, + l_year, + sum(volume) as revenue +from + ( + select + n1.n_name as supp_nation, + n2.n_name as cust_nation, + extract(year from l_shipdate) as l_year, + l_extendedprice * (1 - l_discount) as volume + from + supplier, + lineitem, + orders, + customer, + nation n1, + nation n2 + where + s_suppkey = l_suppkey + and o_orderkey = l_orderkey + and c_custkey = o_custkey + and s_nationkey = n1.n_nationkey + and c_nationkey = n2.n_nationkey + and ( + (n1.n_name = 'EGYPT' and n2.n_name = 'UNITED STATES') + or (n1.n_name = 'UNITED STATES' and n2.n_name = 'EGYPT') + ) + and l_shipdate between date '1995-01-01' and date '1996-12-31' + ) as shipping +group by + supp_nation, + cust_nation, + l_year +order by + supp_nation, + cust_nation, + l_year +LIMIT 1; \ No newline at end of file diff --git a/marble/environments/db_env_docker/tpch-queries/7.sql b/marble/environments/db_env_docker/tpch-queries/7.sql new file mode 100644 index 00000000..d1a4441b --- /dev/null +++ b/marble/environments/db_env_docker/tpch-queries/7.sql @@ -0,0 +1,46 @@ +-- $ID$ +-- TPC-H/TPC-R Volume Shipping Query (Q7) +-- Functional Query Definition +-- Approved February 1998 +:x +:o +select + supp_nation, + cust_nation, + l_year, + sum(volume) as revenue +from + ( + select + n1.n_name as supp_nation, + n2.n_name as cust_nation, + extract(year from l_shipdate) as l_year, + l_extendedprice * (1 - l_discount) as volume + from + supplier, + lineitem, + orders, + customer, + nation n1, + nation n2 + where + s_suppkey = l_suppkey + and o_orderkey = l_orderkey + and c_custkey = o_custkey + and s_nationkey = n1.n_nationkey + and c_nationkey = n2.n_nationkey + and ( + (n1.n_name = ':1' and n2.n_name = ':2') + or (n1.n_name = ':2' and n2.n_name = ':1') + ) + and l_shipdate between date '1995-01-01' and date '1996-12-31' + ) as shipping +group by + supp_nation, + cust_nation, + l_year +order by + supp_nation, + cust_nation, + l_year; +:n -1 diff --git a/marble/environments/db_env_docker/tpch-queries/8.explain.sql b/marble/environments/db_env_docker/tpch-queries/8.explain.sql new file mode 100644 index 00000000..b11e3493 --- /dev/null +++ b/marble/environments/db_env_docker/tpch-queries/8.explain.sql @@ -0,0 +1,81 @@ +-- using 1585648051 as a seed to the RNG + + +explain select + o_year, + sum(case + when nation = 'UNITED STATES' then volume + else 0 + end) / sum(volume) as mkt_share +from + ( + select + extract(year from o_orderdate) as o_year, + l_extendedprice * (1 - l_discount) as volume, + n2.n_name as nation + from + part, + supplier, + lineitem, + orders, + customer, + nation n1, + nation n2, + region + where + p_partkey = l_partkey + and s_suppkey = l_suppkey + and 
l_orderkey = o_orderkey + and o_custkey = c_custkey + and c_nationkey = n1.n_nationkey + and n1.n_regionkey = r_regionkey + and r_name = 'AMERICA' + and s_nationkey = n2.n_nationkey + and o_orderdate between date '1995-01-01' and date '1996-12-31' + and p_type = 'ECONOMY BURNISHED COPPER' + ) as all_nations +group by + o_year +order by + o_year +LIMIT 1;-- using 1585648051 as a seed to the RNG + + +select + o_year, + sum(case + when nation = 'UNITED STATES' then volume + else 0 + end) / sum(volume) as mkt_share +from + ( + select + extract(year from o_orderdate) as o_year, + l_extendedprice * (1 - l_discount) as volume, + n2.n_name as nation + from + part, + supplier, + lineitem, + orders, + customer, + nation n1, + nation n2, + region + where + p_partkey = l_partkey + and s_suppkey = l_suppkey + and l_orderkey = o_orderkey + and o_custkey = c_custkey + and c_nationkey = n1.n_nationkey + and n1.n_regionkey = r_regionkey + and r_name = 'AMERICA' + and s_nationkey = n2.n_nationkey + and o_orderdate between date '1995-01-01' and date '1996-12-31' + and p_type = 'ECONOMY BURNISHED COPPER' + ) as all_nations +group by + o_year +order by + o_year +LIMIT 1; \ No newline at end of file diff --git a/marble/environments/db_env_docker/tpch-queries/8.sql b/marble/environments/db_env_docker/tpch-queries/8.sql new file mode 100644 index 00000000..677e06f2 --- /dev/null +++ b/marble/environments/db_env_docker/tpch-queries/8.sql @@ -0,0 +1,44 @@ +-- $ID$ +-- TPC-H/TPC-R National Market Share Query (Q8) +-- Functional Query Definition +-- Approved February 1998 +:x +:o +select + o_year, + sum(case + when nation = ':1' then volume + else 0 + end) / sum(volume) as mkt_share +from + ( + select + extract(year from o_orderdate) as o_year, + l_extendedprice * (1 - l_discount) as volume, + n2.n_name as nation + from + part, + supplier, + lineitem, + orders, + customer, + nation n1, + nation n2, + region + where + p_partkey = l_partkey + and s_suppkey = l_suppkey + and l_orderkey = o_orderkey + and o_custkey = c_custkey + and c_nationkey = n1.n_nationkey + and n1.n_regionkey = r_regionkey + and r_name = ':2' + and s_nationkey = n2.n_nationkey + and o_orderdate between date '1995-01-01' and date '1996-12-31' + and p_type = ':3' + ) as all_nations +group by + o_year +order by + o_year; +:n -1 diff --git a/marble/environments/db_env_docker/tpch-queries/9.explain.sql b/marble/environments/db_env_docker/tpch-queries/9.explain.sql new file mode 100644 index 00000000..e7c1e2dd --- /dev/null +++ b/marble/environments/db_env_docker/tpch-queries/9.explain.sql @@ -0,0 +1,71 @@ +-- using 1585648051 as a seed to the RNG + + +explain select + nation, + o_year, + sum(amount) as sum_profit +from + ( + select + n_name as nation, + extract(year from o_orderdate) as o_year, + l_extendedprice * (1 - l_discount) - ps_supplycost * l_quantity as amount + from + part, + supplier, + lineitem, + partsupp, + orders, + nation + where + s_suppkey = l_suppkey + and ps_suppkey = l_suppkey + and ps_partkey = l_partkey + and p_partkey = l_partkey + and o_orderkey = l_orderkey + and s_nationkey = n_nationkey + and p_name like '%puff%' + ) as profit +group by + nation, + o_year +order by + nation, + o_year desc +LIMIT 1;-- using 1585648051 as a seed to the RNG + + +select + nation, + o_year, + sum(amount) as sum_profit +from + ( + select + n_name as nation, + extract(year from o_orderdate) as o_year, + l_extendedprice * (1 - l_discount) - ps_supplycost * l_quantity as amount + from + part, + supplier, + lineitem, + partsupp, + orders, + nation + 
where + s_suppkey = l_suppkey + and ps_suppkey = l_suppkey + and ps_partkey = l_partkey + and p_partkey = l_partkey + and o_orderkey = l_orderkey + and s_nationkey = n_nationkey + and p_name like '%puff%' + ) as profit +group by + nation, + o_year +order by + nation, + o_year desc +LIMIT 1; \ No newline at end of file diff --git a/marble/environments/db_env_docker/tpch-queries/9.sql b/marble/environments/db_env_docker/tpch-queries/9.sql new file mode 100644 index 00000000..1d635117 --- /dev/null +++ b/marble/environments/db_env_docker/tpch-queries/9.sql @@ -0,0 +1,39 @@ +-- $ID$ +-- TPC-H/TPC-R Product Type Profit Measure Query (Q9) +-- Functional Query Definition +-- Approved February 1998 +:x +:o +select + nation, + o_year, + sum(amount) as sum_profit +from + ( + select + n_name as nation, + extract(year from o_orderdate) as o_year, + l_extendedprice * (1 - l_discount) - ps_supplycost * l_quantity as amount + from + part, + supplier, + lineitem, + partsupp, + orders, + nation + where + s_suppkey = l_suppkey + and ps_suppkey = l_suppkey + and ps_partkey = l_partkey + and p_partkey = l_partkey + and o_orderkey = l_orderkey + and s_nationkey = n_nationkey + and p_name like '%:1%' + ) as profit +group by + nation, + o_year +order by + nation, + o_year desc; +:n -1 diff --git a/marble/environments/db_utils/anomaly_detection.py b/marble/environments/db_utils/anomaly_detection.py index f7aca760..145749bc 100644 --- a/marble/environments/db_utils/anomaly_detection.py +++ b/marble/environments/db_utils/anomaly_detection.py @@ -1,40 +1,54 @@ -from typing import Any, Tuple - import numpy as np -def detect_anomalies( - data: np.ndarray[Any, np.dtype[np.float64]], - significance_level: float = 0.2 -) -> Tuple[float, np.ndarray[Any, np.dtype[np.bool_]]]: +def detect_anomalies(data, significance_level=0.2): """ Detects anomalies in the given data using the KS test algorithm. Args: data (numpy.ndarray): 1-D array of data values. - significance_level (float): Level of significance for the KS test (default: 0.2). + significance_level (float): Level of significance for the KS test (default: 0.05). Returns: - Tuple[float, numpy.ndarray]: - - KS statistic (float): The maximum absolute difference between empirical and expected CDFs. - - Boolean array (numpy.ndarray): Array indicating anomalies (True) and non-anomalies (False). + numpy.ndarray: Boolean array indicating anomalies (True) and non-anomalies (False). 
""" - sorted_data: np.ndarray[Any, np.dtype[np.float64]] = np.sort(data) - n: int = len(sorted_data) + + sorted_data = np.sort(data) + n = len(sorted_data) # Calculate the expected CDF assuming a normal distribution - expected_cdf: np.ndarray[Any, np.dtype[np.float64]] = np.arange(1, n + 1) / n + expected_cdf = np.arange(1, n + 1) / n # Calculate the empirical CDF - empirical_cdf: np.ndarray[Any, np.dtype[np.float64]] = np.searchsorted(sorted_data, sorted_data, side='right') / n + empirical_cdf = np.searchsorted(sorted_data, sorted_data, side='right') / n # Calculate the maximum absolute difference between the expected and empirical CDFs - ks_statistic: float = np.max(np.abs(empirical_cdf - expected_cdf)) + ks_statistic = np.max(np.abs(empirical_cdf - expected_cdf)) # Calculate the critical value based on the significance level and sample size - critical_value: float = np.sqrt(-0.1 * np.log(significance_level / 2) / n) + critical_value = np.sqrt(-0.1 * np.log(significance_level / 2) / n) # Compare the KS statistic with the critical value - anomalies: np.ndarray[Any, np.dtype[np.bool_]] = np.full_like(data, ks_statistic > critical_value, dtype=bool) - - return ks_statistic, anomalies + anomalies = np.where(ks_statistic > critical_value, True, False) + + return ks_statistic.tolist(), anomalies.tolist() + +def describe_data_features(data): + """Describe the features of a given data in natural language.""" + if data == []: + raise Exception("No metric values found for the given time range") + + # compute processed values for the metric + # max (reserve two decimal places) + max_value = round(np.max(np.array(data)), 2) + # min + min_value = round(np.min(np.array(data)), 2) + # mean + mean_value = round(np.mean(np.array(data)), 2) + # deviation + deviation_value = round(np.std(np.array(data)), 2) + # evenly sampled 10 values (reserve two decimal places) + evenly_sampled_values = [round(data[i], 2) for i in range(0, len(data), len(data) // 10)] + + # describe the above five values in a string + return f"the max value is {max_value}, the min value is {min_value}, the mean value is {mean_value}, the deviation value is {deviation_value}, and the evenly_sampled_values are {evenly_sampled_values}." 
\ No newline at end of file diff --git a/marble/environments/db_utils/diagnostic_kb.py b/marble/environments/db_utils/diagnostic_kb.py new file mode 100644 index 00000000..41b4c1d2 --- /dev/null +++ b/marble/environments/db_utils/diagnostic_kb.py @@ -0,0 +1,160 @@ +import os +import json +from typing import List, Dict, Optional +from dataclasses import dataclass +import re +from collections import defaultdict + +@dataclass +class Diagnostic: + cause_name: str + desc: str + metrics: str + source_file: str + +class DiagnosticKB: + """ + Available Experts (folder names): + - ConfigurationExpert + - CpuExpert + - DiskExpert + - IndexExpert + - IoExpert + - MemoryExpert + - QueryExpert + - RecoveryExpert + - WorkloadExpert + """ + + def __init__(self, base_folder: str = ''): + """Initialize knowledge base from a folder containing expert subdirectories""" + if not base_folder: + current_dir = os.path.dirname(os.path.abspath(__file__)) + knowledge_base_dir = os.path.join(current_dir, 'knowledge_base') + self.base_folder = knowledge_base_dir + else: + self.base_folder = base_folder + + if not os.path.exists(self.base_folder): + raise ValueError(f"Knowledge base directory not found at {self.base_folder}") + + self.diagnostics: List[Diagnostic] = [] + self.cause_to_diagnostic: Dict[str, Diagnostic] = {} + self.load_documents() + + def get_experts(self) -> List[str]: + """Get list of all expert names (folder names)""" + return [d for d in os.listdir(self.base_folder) + if os.path.isdir(os.path.join(self.base_folder, d))] + + def load_documents(self): + """Load all JSON documents from expert subdirectories""" + self.diagnostics = [] + self.cause_to_diagnostic = {} + + for root, dirs, files in os.walk(self.base_folder): + for file in files: + if file.endswith('.json'): + file_path = os.path.join(root, file) + try: + with open(file_path, 'r', encoding='utf-8') as f: + diagnoses = json.load(f) + for diag in diagnoses: + if diag['cause_name'] not in self.cause_to_diagnostic: + diagnostic = Diagnostic( + cause_name=diag['cause_name'], + desc=diag['desc'], + metrics=diag['metrics'], + source_file=file_path + ) + self.diagnostics.append(diagnostic) + self.cause_to_diagnostic[diag['cause_name']] = diagnostic + except json.JSONDecodeError as e: + print(f"Error loading {file_path}: {e}") + + def search(self, query: str, expert: str = '', top_k: int = 3) -> List[Dict]: + """ + Search diagnostics using keyword matching with improved relevance scoring + Args: + query: Search terms + expert: Specific expert to search from (e.g., 'CpuExpert'). Empty string means search all. 
+ top_k: Maximum number of results to return + """ + def calculate_relevance(diagnostic: Diagnostic, search_terms: List[str]) -> tuple: + text = f"{diagnostic.cause_name} {diagnostic.desc} {diagnostic.metrics}".lower() + + scores = { + 'cause_name': 0, + 'desc': 0, + 'metrics': 0 + } + + for term in search_terms: + term = term.lower() + scores['cause_name'] += len(re.findall(r'\b' + re.escape(term) + r'\b', + diagnostic.cause_name.lower())) * 3 + scores['metrics'] += len(re.findall(r'\b' + re.escape(term) + r'\b', + diagnostic.metrics.lower())) * 2 + scores['desc'] += len(re.findall(r'\b' + re.escape(term) + r'\b', + diagnostic.desc.lower())) + + total_score = sum(scores.values()) + return (total_score, scores['cause_name']) + + search_terms = [term.strip() for term in query.split() if term.strip()] + + if not search_terms: + return [] + + # Filter diagnostics by expert if specified + diagnostics_to_search = self.diagnostics + if expert: + expert_path = os.path.join(self.base_folder, expert) + diagnostics_to_search = [ + diag for diag in self.diagnostics + if diag.source_file.startswith(expert_path) + ] + + if not diagnostics_to_search: + print(f"Warning: No diagnostics found for expert '{expert}'") + return [] + + scored_results = [ + (diag, *calculate_relevance(diag, search_terms)) + for diag in diagnostics_to_search + ] + + scored_results.sort(key=lambda x: (x[1], x[2]), reverse=True) + + results = [] + seen_causes = set() + + for diag, total_score, _ in scored_results: + if total_score > 0 and diag.cause_name not in seen_causes: + seen_causes.add(diag.cause_name) + results.append({ + 'cause_name': diag.cause_name, + 'desc': diag.desc, + 'metrics': diag.metrics.split('\n'), + 'score': total_score, + 'source': diag.source_file, + 'expert': os.path.basename(os.path.dirname(os.path.dirname(diag.source_file))) + }) + + if len(results) >= top_k: + break + + return results + + def get_diagnostic_by_cause(self, cause_name: str) -> Optional[Dict]: + """Get specific diagnostic by cause name""" + diag = self.cause_to_diagnostic.get(cause_name) + if diag: + return { + 'cause_name': diag.cause_name, + 'desc': diag.desc, + 'metrics': diag.metrics.split('\n'), + 'source': diag.source_file, + 'expert': os.path.basename(os.path.dirname(os.path.dirname(diag.source_file))) + } + return None \ No newline at end of file diff --git a/marble/environments/db_utils/metrics.py b/marble/environments/db_utils/metrics.py new file mode 100644 index 00000000..5b747691 --- /dev/null +++ b/marble/environments/db_utils/metrics.py @@ -0,0 +1,81 @@ +"""Prometheus queries""" + +allowed_metrics_full_names = { + "cpu_usage": "avg(irate(node_cpu_seconds_total{mode='user'}[1m])) * 100", # Overall CPU usage (user mode) + "memory_usage": "node_memory_MemTotal_bytes - (node_memory_Cached_bytes + node_memory_Buffers_bytes + node_memory_MemFree_bytes)", # Overall memory usage + "network_traffic": "irate(node_network_receive_bytes_total[1m]) + irate(node_network_transmit_bytes_total[1m])", # Total network traffic (inbound + outbound) + "io_activity": "irate(node_disk_read_bytes_total[1m]) + irate(node_disk_written_bytes_total[1m])", # Total I/O activity (read + write) +} + +full_metrics_full_names = { + "cpu": { + "cpu_usage": "avg(irate(node_cpu_seconds_total{mode='user'}[1m])) * 100", + "node_scrape_collector_duration_seconds": "node_scrape_collector_duration_seconds", + "node_procs_running": "node_procs_running", + "node_procs_blocked": "node_procs_blocked", + "node_entropy_available_bits": "node_entropy_available_bits", + 
"node_load1": "node_load1", + "node_load5": "node_load5", + "node_load15": "node_load15", + "pg_settings_random_page_cost": "pg_settings_random_page_cost", + "pg_settings_max_worker_processes": "pg_settings_max_worker_processes", + "pg_settings_max_parallel_workers": "pg_settings_max_parallel_workers", + "pg_active_connection_count": "pg_stat_activity_count{state='active'} != 0" + }, + "memory": { + "memory_usage": "node_memory_MemTotal_bytes - (node_memory_Cached_bytes + node_memory_Buffers_bytes + node_memory_MemFree_bytes)", + "node_memory_MemTotal_bytes": "node_memory_MemTotal_bytes", + "node_memory_Cached_bytes": "node_memory_Cached_bytes", + "node_memory_Buffers_bytes": "node_memory_Buffers_bytes", + "node_memory_MemFree_bytes": "node_memory_MemFree_bytes", + "node_memory_Inactive_anon_bytes": "node_memory_Inactive_anon_bytes", + "node_memory_MemAvailable_bytes": "node_memory_MemAvailable_bytes", + "node_memory_Dirty_bytes": "node_memory_Dirty_bytes", + "pg_stat_activity_active_connections": "pg_stat_activity_count{state='active'} != 0", + "pg_settings_shared_buffers_bytes": "pg_settings_shared_buffers_bytes", + "pg_settings_effective_cache_size_bytes": "pg_settings_effective_cache_size_bytes", + "pg_settings_maintenance_work_mem_bytes": "pg_settings_maintenance_work_mem_bytes", + "pg_settings_work_mem_bytes": "pg_settings_work_mem_bytes", + "pg_settings_max_wal_size_bytes": "pg_settings_max_wal_size_bytes", + "pg_stat_bgwriter_buffers_alloc_rate": "irate(pg_stat_bgwriter_buffers_alloc[5m])", + "pg_stat_bgwriter_buffers_backend_fsync_rate": "irate(pg_stat_bgwriter_buffers_backend_fsync[5m])", + "pg_stat_bgwriter_buffers_checkpoint_rate": "irate(pg_stat_bgwriter_buffers_checkpoint[5m])", + "pg_stat_bgwriter_buffers_clean_rate": "irate(pg_stat_bgwriter_buffers_clean[5m])", + "pg_stat_database_conflicts_rate": "irate(pg_stat_database_conflicts[5m])", + "pg_stat_database_deadlocks_rate": "irate(pg_stat_database_deadlocks[5m])" + }, + "network": { + "node_sockstat_tcp_time_wait": "node_sockstat_TCP_tw", + "node_sockstat_tcp_orphan": "node_sockstat_TCP_orphan", + "node_sockstat_tcp_alloc": "node_sockstat_TCP_alloc", + "node_sockstat_tcp_inuse": "node_sockstat_TCP_inuse", + "node_netstat_tcp_passive_opens_rate": "irate(node_netstat_Tcp_PassiveOpens[1m])", + "pg_stat_activity_active_connections": "pg_stat_activity_count{state='active'} != 0" + }, + "io": { + "pg_stat_database_tup_fetched_total": "SUM(pg_stat_database_tup_fetched)", + "pg_stat_database_tup_inserted_total": "SUM(pg_stat_database_tup_inserted)", + "pg_stat_database_tup_updated_total": "SUM(pg_stat_database_tup_updated)", + "process_open_file_descriptors": "process_open_fds", + "pg_stat_database_xact_commit_rate": "irate(pg_stat_database_xact_commit[5m])", + "pg_stat_database_xact_rollback_rate": "irate(pg_stat_database_xact_rollback[5m])", + "pg_stat_database_tup_updated_non_zero": "pg_stat_database_tup_updated != 0", + "pg_stat_database_blks_hit_ratio": "pg_stat_database_blks_hit / (pg_stat_database_blks_read + pg_stat_database_blks_hit)", + "pg_stat_database_temp_bytes_rate": "irate(pg_stat_database_temp_bytes[5m])", + "pg_stat_bgwriter_checkpoint_write_time_rate": "irate(pg_stat_bgwriter_checkpoint_write_time[5m])", + "pg_stat_bgwriter_checkpoint_sync_time_rate": "irate(pg_stat_bgwriter_checkpoint_sync_time[5m])", + "node_filesystem_used_bytes": "node_filesystem_size_bytes - node_filesystem_avail_bytes", + "node_filesystem_size_bytes": "node_filesystem_size_bytes", + "node_filesystem_used_ratio": "1 - 
(node_filesystem_free_bytes / node_filesystem_size_bytes)", + "node_disk_reads_completed_rate": "irate(node_disk_reads_completed_total[1m])", + "node_disk_writes_completed_rate": "irate(node_disk_writes_completed_total[1m])", + "node_disk_io_in_progress": "node_disk_io_now", + "node_disk_read_bytes_rate": "irate(node_disk_read_bytes_total[1m])", + "node_disk_written_bytes_rate": "irate(node_disk_written_bytes_total[1m])", + "node_disk_io_time_seconds_rate": "irate(node_disk_io_time_seconds_total[1m])", + "node_disk_io_time_weighted_seconds_rate": "irate(node_disk_io_time_weighted_seconds_total[1m])", + "node_disk_read_time_seconds_rate": "irate(node_disk_read_time_seconds_total[1m])", + "node_disk_write_time_seconds_rate": "irate(node_disk_write_time_seconds_total[1m])", + } +} \ No newline at end of file
diff --git a/marble/environments/db_utils/slow_query.py b/marble/environments/db_utils/slow_query.py new file mode 100644 index 00000000..abd924fa --- /dev/null +++ b/marble/environments/db_utils/slow_query.py @@ -0,0 +1,51 @@ +import psycopg2 +from psycopg2.extras import RealDictCursor + +def obtain_slow_queries(server_address="localhost", + username="test", + password="Test123_456", + database="sysbench", + port="5432", + top_k=10): + try: + connection = psycopg2.connect( + user=username, + password=password, + database=database, + host=server_address, + port=port + ) + + cursor = connection.cursor(cursor_factory=RealDictCursor) + + slow_queries_query = f""" + CREATE EXTENSION IF NOT EXISTS pg_stat_statements; + SELECT + query, + total_exec_time + FROM pg_stat_statements + ORDER BY total_exec_time DESC + LIMIT {top_k}; + """ + + cursor.execute(slow_queries_query) + slow_queries = cursor.fetchall() + slow_queries_str = "" + + for idx, record in enumerate(slow_queries, start=1): + slow_queries_str += f"{idx}. Query: {record['query']}\n" + slow_queries_str += f" Total Execution Time: {record['total_exec_time']}\n" + slow_queries_str += "-" * 10 + slow_queries_str += "\n" + + cursor.close() + connection.close() + + return slow_queries_str + + except Exception as e: + print(f"Error: {e}") + return "" + +if __name__ == "__main__": + print(obtain_slow_queries()) \ No newline at end of file
diff --git a/marble/main.py b/marble/main.py index 4e372393..9d437cf5 100644 --- a/marble/main.py +++ b/marble/main.py @@ -18,44 +18,52 @@ def parse_args() -> argparse.Namespace: Returns: argparse.Namespace: Parsed arguments. """ - parser = argparse.ArgumentParser(description="Run the Marble simulation engine.") + parser = argparse.ArgumentParser(description="Run the Marble simulation engine for all YAML config files in a directory.") parser.add_argument( '--config_path', type=str, required=True, - help='Path to the configuration YAML file.' + help='Path to the directory containing configuration YAML files.' ) return parser.parse_args() def main() -> None: """ - Main function to run the simulation with the specified config file. + Main function to run the simulation for each YAML config file in the specified directory.
""" args = parse_args() - # Check if the config file exists - if not os.path.isfile(args.config_path): - logging.error(f"Configuration file not found: {args.config_path}") + # Check if the config path is a valid directory + if not os.path.isdir(args.config_path): + logging.error(f"Configuration path is not a directory: {args.config_path}") sys.exit(1) - # Load configuration - try: - config = Config.load(args.config_path) - except Exception as e: - logging.error(f"Error loading configuration from {args.config_path}: {e}") - sys.exit(1) + # Iterate over all YAML files in the directory + for filename in os.listdir(args.config_path): + if filename.endswith(".yaml"): + config_file_path = os.path.join(args.config_path, filename) - # Initialize and start the engine - try: - logging.info(f"Starting engine with configuration: {args.config_path}") - engine = Engine(config) - engine.start() - except Exception: - logging.exception(f"An error occurred while running the engine with configuration: {args.config_path}") - sys.exit(1) + # Load configuration + try: + config = Config.load(config_file_path) + except FileNotFoundError: + logging.error(f"Configuration file not found at path: {config_file_path}") + continue + except Exception as e: + logging.error(f"Error loading configuration from {config_file_path}: {e}") + continue + + # Initialize and start the engine + try: + logging.info(f"Starting engine with configuration: {config_file_path}") + engine = Engine(config) + engine.start() + except Exception: + logging.exception(f"An error occurred while running the engine with configuration: {config_file_path}") + continue if __name__ == '__main__': logging.basicConfig(level=logging.INFO) - main() \ No newline at end of file + main() diff --git a/marble/run_demo.sh b/marble/run_demo.sh index d50e238b..51fe806a 100755 --- a/marble/run_demo.sh +++ b/marble/run_demo.sh @@ -1,7 +1,7 @@ #!/bin/bash # Define the path to the configuration file -CONFIG_FILE="./configs/test_config" +CONFIG_FILE="./configs/test_config_db_single" # Execute the simulation engine with the specified configuration python main.py --config "$CONFIG_FILE" diff --git a/poetry.lock b/poetry.lock index 0273539c..06e78f13 100644 --- a/poetry.lock +++ b/poetry.lock @@ -1,4 +1,4 @@ -# This file is automatically @generated by Poetry 1.8.3 and should not be changed by hand. +# This file is automatically @generated by Poetry 1.8.4 and should not be changed by hand. 
[[package]] name = "aiohappyeyeballs" @@ -2271,6 +2271,82 @@ files = [ [package.extras] test = ["enum34", "ipaddress", "mock", "pywin32", "wmi"] +[[package]] +name = "psycopg2-binary" +version = "2.9.10" +description = "psycopg2 - Python-PostgreSQL Database Adapter" +optional = false +python-versions = ">=3.8" +files = [ + {file = "psycopg2-binary-2.9.10.tar.gz", hash = "sha256:4b3df0e6990aa98acda57d983942eff13d824135fe2250e6522edaa782a06de2"}, + {file = "psycopg2_binary-2.9.10-cp310-cp310-macosx_12_0_x86_64.whl", hash = "sha256:0ea8e3d0ae83564f2fc554955d327fa081d065c8ca5cc6d2abb643e2c9c1200f"}, + {file = "psycopg2_binary-2.9.10-cp310-cp310-macosx_14_0_arm64.whl", hash = "sha256:3e9c76f0ac6f92ecfc79516a8034a544926430f7b080ec5a0537bca389ee0906"}, + {file = "psycopg2_binary-2.9.10-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:2ad26b467a405c798aaa1458ba09d7e2b6e5f96b1ce0ac15d82fd9f95dc38a92"}, + {file = "psycopg2_binary-2.9.10-cp310-cp310-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:270934a475a0e4b6925b5f804e3809dd5f90f8613621d062848dd82f9cd62007"}, + {file = "psycopg2_binary-2.9.10-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:48b338f08d93e7be4ab2b5f1dbe69dc5e9ef07170fe1f86514422076d9c010d0"}, + {file = "psycopg2_binary-2.9.10-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:7f4152f8f76d2023aac16285576a9ecd2b11a9895373a1f10fd9db54b3ff06b4"}, + {file = "psycopg2_binary-2.9.10-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:32581b3020c72d7a421009ee1c6bf4a131ef5f0a968fab2e2de0c9d2bb4577f1"}, + {file = "psycopg2_binary-2.9.10-cp310-cp310-musllinux_1_2_i686.whl", hash = "sha256:2ce3e21dc3437b1d960521eca599d57408a695a0d3c26797ea0f72e834c7ffe5"}, + {file = "psycopg2_binary-2.9.10-cp310-cp310-musllinux_1_2_ppc64le.whl", hash = "sha256:e984839e75e0b60cfe75e351db53d6db750b00de45644c5d1f7ee5d1f34a1ce5"}, + {file = "psycopg2_binary-2.9.10-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:3c4745a90b78e51d9ba06e2088a2fe0c693ae19cc8cb051ccda44e8df8a6eb53"}, + {file = "psycopg2_binary-2.9.10-cp310-cp310-win32.whl", hash = "sha256:e5720a5d25e3b99cd0dc5c8a440570469ff82659bb09431c1439b92caf184d3b"}, + {file = "psycopg2_binary-2.9.10-cp310-cp310-win_amd64.whl", hash = "sha256:3c18f74eb4386bf35e92ab2354a12c17e5eb4d9798e4c0ad3a00783eae7cd9f1"}, + {file = "psycopg2_binary-2.9.10-cp311-cp311-macosx_12_0_x86_64.whl", hash = "sha256:04392983d0bb89a8717772a193cfaac58871321e3ec69514e1c4e0d4957b5aff"}, + {file = "psycopg2_binary-2.9.10-cp311-cp311-macosx_14_0_arm64.whl", hash = "sha256:1a6784f0ce3fec4edc64e985865c17778514325074adf5ad8f80636cd029ef7c"}, + {file = "psycopg2_binary-2.9.10-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:b5f86c56eeb91dc3135b3fd8a95dc7ae14c538a2f3ad77a19645cf55bab1799c"}, + {file = "psycopg2_binary-2.9.10-cp311-cp311-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:2b3d2491d4d78b6b14f76881905c7a8a8abcf974aad4a8a0b065273a0ed7a2cb"}, + {file = "psycopg2_binary-2.9.10-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:2286791ececda3a723d1910441c793be44625d86d1a4e79942751197f4d30341"}, + {file = "psycopg2_binary-2.9.10-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:512d29bb12608891e349af6a0cccedce51677725a921c07dba6342beaf576f9a"}, + {file = "psycopg2_binary-2.9.10-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:5a507320c58903967ef7384355a4da7ff3f28132d679aeb23572753cbf2ec10b"}, 
+ {file = "psycopg2_binary-2.9.10-cp311-cp311-musllinux_1_2_i686.whl", hash = "sha256:6d4fa1079cab9018f4d0bd2db307beaa612b0d13ba73b5c6304b9fe2fb441ff7"}, + {file = "psycopg2_binary-2.9.10-cp311-cp311-musllinux_1_2_ppc64le.whl", hash = "sha256:851485a42dbb0bdc1edcdabdb8557c09c9655dfa2ca0460ff210522e073e319e"}, + {file = "psycopg2_binary-2.9.10-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:35958ec9e46432d9076286dda67942ed6d968b9c3a6a2fd62b48939d1d78bf68"}, + {file = "psycopg2_binary-2.9.10-cp311-cp311-win32.whl", hash = "sha256:ecced182e935529727401b24d76634a357c71c9275b356efafd8a2a91ec07392"}, + {file = "psycopg2_binary-2.9.10-cp311-cp311-win_amd64.whl", hash = "sha256:ee0e8c683a7ff25d23b55b11161c2663d4b099770f6085ff0a20d4505778d6b4"}, + {file = "psycopg2_binary-2.9.10-cp312-cp312-macosx_12_0_x86_64.whl", hash = "sha256:880845dfe1f85d9d5f7c412efea7a08946a46894537e4e5d091732eb1d34d9a0"}, + {file = "psycopg2_binary-2.9.10-cp312-cp312-macosx_14_0_arm64.whl", hash = "sha256:9440fa522a79356aaa482aa4ba500b65f28e5d0e63b801abf6aa152a29bd842a"}, + {file = "psycopg2_binary-2.9.10-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:e3923c1d9870c49a2d44f795df0c889a22380d36ef92440ff618ec315757e539"}, + {file = "psycopg2_binary-2.9.10-cp312-cp312-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:7b2c956c028ea5de47ff3a8d6b3cc3330ab45cf0b7c3da35a2d6ff8420896526"}, + {file = "psycopg2_binary-2.9.10-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:f758ed67cab30b9a8d2833609513ce4d3bd027641673d4ebc9c067e4d208eec1"}, + {file = "psycopg2_binary-2.9.10-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:8cd9b4f2cfab88ed4a9106192de509464b75a906462fb846b936eabe45c2063e"}, + {file = "psycopg2_binary-2.9.10-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:6dc08420625b5a20b53551c50deae6e231e6371194fa0651dbe0fb206452ae1f"}, + {file = "psycopg2_binary-2.9.10-cp312-cp312-musllinux_1_2_i686.whl", hash = "sha256:d7cd730dfa7c36dbe8724426bf5612798734bff2d3c3857f36f2733f5bfc7c00"}, + {file = "psycopg2_binary-2.9.10-cp312-cp312-musllinux_1_2_ppc64le.whl", hash = "sha256:155e69561d54d02b3c3209545fb08938e27889ff5a10c19de8d23eb5a41be8a5"}, + {file = "psycopg2_binary-2.9.10-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:c3cc28a6fd5a4a26224007712e79b81dbaee2ffb90ff406256158ec4d7b52b47"}, + {file = "psycopg2_binary-2.9.10-cp312-cp312-win32.whl", hash = "sha256:ec8a77f521a17506a24a5f626cb2aee7850f9b69a0afe704586f63a464f3cd64"}, + {file = "psycopg2_binary-2.9.10-cp312-cp312-win_amd64.whl", hash = "sha256:18c5ee682b9c6dd3696dad6e54cc7ff3a1a9020df6a5c0f861ef8bfd338c3ca0"}, + {file = "psycopg2_binary-2.9.10-cp313-cp313-macosx_12_0_x86_64.whl", hash = "sha256:26540d4a9a4e2b096f1ff9cce51253d0504dca5a85872c7f7be23be5a53eb18d"}, + {file = "psycopg2_binary-2.9.10-cp313-cp313-macosx_14_0_arm64.whl", hash = "sha256:e217ce4d37667df0bc1c397fdcd8de5e81018ef305aed9415c3b093faaeb10fb"}, + {file = "psycopg2_binary-2.9.10-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:245159e7ab20a71d989da00f280ca57da7641fa2cdcf71749c193cea540a74f7"}, + {file = "psycopg2_binary-2.9.10-cp313-cp313-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:3c4ded1a24b20021ebe677b7b08ad10bf09aac197d6943bfe6fec70ac4e4690d"}, + {file = "psycopg2_binary-2.9.10-cp313-cp313-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:3abb691ff9e57d4a93355f60d4f4c1dd2d68326c968e7db17ea96df3c023ef73"}, + {file = 
"psycopg2_binary-2.9.10-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:8608c078134f0b3cbd9f89b34bd60a943b23fd33cc5f065e8d5f840061bd0673"}, + {file = "psycopg2_binary-2.9.10-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:230eeae2d71594103cd5b93fd29d1ace6420d0b86f4778739cb1a5a32f607d1f"}, + {file = "psycopg2_binary-2.9.10-cp313-cp313-musllinux_1_2_i686.whl", hash = "sha256:bb89f0a835bcfc1d42ccd5f41f04870c1b936d8507c6df12b7737febc40f0909"}, + {file = "psycopg2_binary-2.9.10-cp313-cp313-musllinux_1_2_ppc64le.whl", hash = "sha256:f0c2d907a1e102526dd2986df638343388b94c33860ff3bbe1384130828714b1"}, + {file = "psycopg2_binary-2.9.10-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:f8157bed2f51db683f31306aa497311b560f2265998122abe1dce6428bd86567"}, + {file = "psycopg2_binary-2.9.10-cp38-cp38-macosx_12_0_x86_64.whl", hash = "sha256:eb09aa7f9cecb45027683bb55aebaaf45a0df8bf6de68801a6afdc7947bb09d4"}, + {file = "psycopg2_binary-2.9.10-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:b73d6d7f0ccdad7bc43e6d34273f70d587ef62f824d7261c4ae9b8b1b6af90e8"}, + {file = "psycopg2_binary-2.9.10-cp38-cp38-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:ce5ab4bf46a211a8e924d307c1b1fcda82368586a19d0a24f8ae166f5c784864"}, + {file = "psycopg2_binary-2.9.10-cp38-cp38-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:056470c3dc57904bbf63d6f534988bafc4e970ffd50f6271fc4ee7daad9498a5"}, + {file = "psycopg2_binary-2.9.10-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:73aa0e31fa4bb82578f3a6c74a73c273367727de397a7a0f07bd83cbea696baa"}, + {file = "psycopg2_binary-2.9.10-cp38-cp38-musllinux_1_2_aarch64.whl", hash = "sha256:8de718c0e1c4b982a54b41779667242bc630b2197948405b7bd8ce16bcecac92"}, + {file = "psycopg2_binary-2.9.10-cp38-cp38-musllinux_1_2_i686.whl", hash = "sha256:5c370b1e4975df846b0277b4deba86419ca77dbc25047f535b0bb03d1a544d44"}, + {file = "psycopg2_binary-2.9.10-cp38-cp38-musllinux_1_2_ppc64le.whl", hash = "sha256:ffe8ed017e4ed70f68b7b371d84b7d4a790368db9203dfc2d222febd3a9c8863"}, + {file = "psycopg2_binary-2.9.10-cp38-cp38-musllinux_1_2_x86_64.whl", hash = "sha256:8aecc5e80c63f7459a1a2ab2c64df952051df196294d9f739933a9f6687e86b3"}, + {file = "psycopg2_binary-2.9.10-cp39-cp39-macosx_12_0_x86_64.whl", hash = "sha256:7a813c8bdbaaaab1f078014b9b0b13f5de757e2b5d9be6403639b298a04d218b"}, + {file = "psycopg2_binary-2.9.10-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:d00924255d7fc916ef66e4bf22f354a940c67179ad3fd7067d7a0a9c84d2fbfc"}, + {file = "psycopg2_binary-2.9.10-cp39-cp39-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:7559bce4b505762d737172556a4e6ea8a9998ecac1e39b5233465093e8cee697"}, + {file = "psycopg2_binary-2.9.10-cp39-cp39-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:e8b58f0a96e7a1e341fc894f62c1177a7c83febebb5ff9123b579418fdc8a481"}, + {file = "psycopg2_binary-2.9.10-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:6b269105e59ac96aba877c1707c600ae55711d9dcd3fc4b5012e4af68e30c648"}, + {file = "psycopg2_binary-2.9.10-cp39-cp39-musllinux_1_2_aarch64.whl", hash = "sha256:79625966e176dc97ddabc142351e0409e28acf4660b88d1cf6adb876d20c490d"}, + {file = "psycopg2_binary-2.9.10-cp39-cp39-musllinux_1_2_i686.whl", hash = "sha256:8aabf1c1a04584c168984ac678a668094d831f152859d06e055288fa515e4d30"}, + {file = "psycopg2_binary-2.9.10-cp39-cp39-musllinux_1_2_ppc64le.whl", hash = 
"sha256:19721ac03892001ee8fdd11507e6a2e01f4e37014def96379411ca99d78aeb2c"}, + {file = "psycopg2_binary-2.9.10-cp39-cp39-musllinux_1_2_x86_64.whl", hash = "sha256:7f5d859928e635fa3ce3477704acee0f667b3a3d3e4bb109f2b18d4005f38287"}, + {file = "psycopg2_binary-2.9.10-cp39-cp39-win32.whl", hash = "sha256:3216ccf953b3f267691c90c6fe742e45d890d8272326b4a8b20850a03d05b7b8"}, + {file = "psycopg2_binary-2.9.10-cp39-cp39-win_amd64.whl", hash = "sha256:30e34c4e97964805f715206c7b789d54a78b70f3ff19fbe590104b71c45600e5"}, +] + [[package]] name = "ptyprocess" version = "0.7.0" @@ -3955,4 +4031,4 @@ test = ["big-O", "importlib-resources", "jaraco.functools", "jaraco.itertools", [metadata] lock-version = "2.0" python-versions = ">=3.9, <3.12" -content-hash = "c8f5820a6efc0fcab43ea7d21c627d38c32d01d0dac79403c2bb3909005a5f00" +content-hash = "d91e470ccbe9fb80019a92ad0b79856fa1a219507d8d8a5a0a07f073db9ee394" diff --git a/pyproject.toml b/pyproject.toml index e35c90f9..e6ff46e1 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -24,6 +24,7 @@ pypdf2 = "^3.0.1" tqdm = "^4.67.0" semanticscholar = "^0.8.4" requests = "^2.32.3" +psycopg2-binary = "^2.9.10" [tool.poetry.group.dev.dependencies]