From cd3f4881b8ee45bd00266c3bedea6b459f9c28c3 Mon Sep 17 00:00:00 2001 From: Christiaan Herrewijn Date: Thu, 6 Mar 2025 12:52:06 +0100 Subject: [PATCH 1/3] check for duplicates immediately before creating issue --- scripts/fuzzer_helper.py | 60 +++++++++++++++++++++++++++------------- scripts/run_fuzzer.py | 36 ++++++++++++------------ scripts/run_sqlancer.py | 12 ++++---- 3 files changed, 65 insertions(+), 43 deletions(-) diff --git a/scripts/fuzzer_helper.py b/scripts/fuzzer_helper.py index c80cedc..1de340c 100644 --- a/scripts/fuzzer_helper.py +++ b/scripts/fuzzer_helper.py @@ -1,10 +1,8 @@ import json import requests -import sys import os import subprocess -import reduce_sql -import fuzzer_helper +import urllib.parse USERNAME = 'fuzzerofducks' @@ -42,6 +40,12 @@ def issue_url(): return 'https://api.github.com/repos/%s/%s/issues' % (REPO_OWNER, REPO_NAME) +def issues_by_title_url(issue_title): + base_url = "https://api.github.com/search/issues" + query_string = urllib.parse.quote(f"repo:{REPO_OWNER}/{REPO_NAME} {issue_title} in:title") + return f"{base_url}?q={query_string}" + + def get_token(): if 'FUZZEROFDUCKSKEY' not in os.environ: print("FUZZEROFDUCKSKEY not found in environment variables") @@ -80,7 +84,7 @@ def make_github_issue(title, body): raise Exception("Failed to create issue") -def get_github_issues(page: int) -> list[dict]: +def get_github_issues_per_page(page: int) -> list[dict]: session = create_session() url = issue_url() + '?per_page=100&page=' + str(page) r = session.get(url) @@ -91,6 +95,18 @@ def get_github_issues(page: int) -> list[dict]: return json.loads(r.content.decode('utf8')) +def get_github_issues_by_title(issue_title) -> list[dict]: + session = create_session() + url = issues_by_title_url(issue_title) + r = session.get(url) + if r.status_code != 200: + print('Failed to query the issues') + print('Response:', r.content.decode('utf8')) + raise Exception("Failed to query the issues") + issue_list = r.json().get("items", []) + return 
issue_list + + def close_github_issue(number): session = create_session() url = issue_url() + '/' + str(number) @@ -150,7 +166,7 @@ def run_shell_command_batch(shell, cmd): return (stdout, stderr, res.returncode, False) -def test_reproducibility(shell, issue, current_errors, perform_check): +def is_reproducible_issue(shell, issue) -> bool: extract = extract_issue(issue['body'], issue['number']) labels = issue['labels'] label_timeout = False @@ -161,8 +177,7 @@ def test_reproducibility(shell, issue, current_errors, perform_check): # failed extract: leave the issue as-is return True sql = extract[0] + ';' - error = extract[1] - if perform_check is True and label_timeout is False: + if label_timeout is False: print(f"Checking issue {issue['number']}...") (stdout, stderr, returncode, is_timeout) = run_shell_command_batch(shell, sql) if is_timeout: @@ -170,24 +185,31 @@ def test_reproducibility(shell, issue, current_errors, perform_check): else: if returncode == 0: return False - if not fuzzer_helper.is_internal_error(stderr): + if not is_internal_error(stderr): return False # issue is still reproducible - current_errors[error] = issue return True -def extract_github_issues(shell, perform_check) -> dict[str, dict]: - current_errors: dict[str, dict] = dict() +def get_github_issues_list() -> list[dict]: + issues: list[dict] = [] for p in range(1, 10): - issues: list[dict] = get_github_issues(p) - for issue in issues: - # check if the github issue is still reproducible - if not test_reproducibility(shell, issue, current_errors, perform_check): - # the issue appears to be fixed - close the issue - print(f"Failed to reproduce issue {issue['number']}, closing...") - close_github_issue(int(issue['number'])) - return current_errors + issues = issues + get_github_issues_per_page(p) + return issues + + +# closes non-reproducible issues; returns reproducible issues +def close_non_reproducible_issues(shell) -> dict[str, dict]: + reproducible_issues: dict[str, dict] = {} + for issue 
in get_github_issues_list(): + if not is_reproducible_issue(shell, issue): + # the issue appears to be fixed - close the issue + print(f"Failed to reproduce issue {issue['number']}, closing...") + close_github_issue(int(issue['number'])) + else: + reproducible_issues[issue['title']] = issue + # return open issues as dict, so they can be searched by title, which is the exception message without trace + return reproducible_issues def file_issue(cmd, exception_msg, stacktrace, fuzzer, seed, hash): diff --git a/scripts/run_fuzzer.py b/scripts/run_fuzzer.py index e1395d4..cbc23c3 100644 --- a/scripts/run_fuzzer.py +++ b/scripts/run_fuzzer.py @@ -102,15 +102,26 @@ def run_shell_command(cmd): return (stdout, stderr, res.returncode) -# first get a list of all github issues, and check if we can still reproduce them +def is_known_issue(exception_msg): + existing_issues = fuzzer_helper.get_github_issues_by_title(exception_msg) + if existing_issues: + print("Skip filing duplicate issue") + print( + "Issue already exists: https://github.com/duckdb/duckdb-fuzzer/issues/" + + str(existing_issues[0]['number']) + ) + return True + else: + return False + -if no_git_checks: - current_errors: dict[str, dict] = dict() -else: - current_errors: dict[str, dict] = fuzzer_helper.extract_github_issues(shell, perform_checks) +# ========================================== +# START OF SCRIPT +# ========================================== # Don't go on and fuzz if perform checks = true if perform_checks: + fuzzer_helper.close_non_reproducible_issues(shell) exit(0) last_query_log_file = 'sqlsmith.log' @@ -190,12 +201,7 @@ def run_shell_command(cmd): print("=========================================") # check if this is a duplicate issue -if exception_msg in current_errors: - print("Skip filing duplicate issue") - print( - "Issue already exists: https://github.com/duckdb/duckdb-fuzzer/issues/" - + str(current_errors[exception_msg]['number']) - ) +if is_known_issue(exception_msg): exit(0) 
print("=========================================") @@ -212,15 +218,9 @@ def run_shell_command(cmd): exception_msg, stacktrace = fuzzer_helper.split_exception_trace(stderr) # check if this is a duplicate issue -if exception_msg in current_errors: - print("Skip filing duplicate issue") - print( - "Issue already exists: https://github.com/duckdb/duckdb-fuzzer/issues/" - + str(current_errors[exception_msg]['number']) - ) +if is_known_issue(exception_msg): exit(0) - print(f"================MARKER====================") print(f"After reducing: the below sql causes an internal error \n `{cmd}`") print(f"{exception_msg}") diff --git a/scripts/run_sqlancer.py b/scripts/run_sqlancer.py index b9b3a0c..70120b3 100644 --- a/scripts/run_sqlancer.py +++ b/scripts/run_sqlancer.py @@ -129,22 +129,22 @@ print(reduced_test_case) (stdout, stderr, returncode) = reduce_sql.run_shell_command(shell, reduced_test_case) -error_msg, _ = fuzzer_helper.split_exception_trace(stderr) +error_msg, trace = fuzzer_helper.split_exception_trace(stderr) print('----------------------------------------------') print("Fetching github issues") print('----------------------------------------------') -# first get a dictinary of all github issues, and check if we can still reproduce them -current_errors = fuzzer_helper.extract_github_issues(shell) +# get a dictionary with all open github issues (close the non-reproducible ones) +open_issues = fuzzer_helper.close_non_reproducible_issues(shell) # check if this is a duplicate issue -if error_msg in current_errors: +if error_msg in open_issues: print("Skip filing duplicate issue") print( "Issue already exists: https://github.com/duckdb/duckdb-fuzzer/issues/" - + str(current_errors[error_msg]['number']) + + str(open_issues[error_msg]['number']) ) exit(0) -fuzzer_helper.file_issue(reduced_test_case, error_msg, "SQLancer", seed, git_hash) +fuzzer_helper.file_issue(reduced_test_case, error_msg, trace, "SQLancer", seed, git_hash) From 
a02a940b04a11fe2f53d779010222afed467e420 Mon Sep 17 00:00:00 2001 From: Christiaan Herrewijn Date: Thu, 6 Mar 2025 15:34:02 +0100 Subject: [PATCH 2/3] sanitize CI stack trace --- scripts/fuzzer_helper.py | 10 +++++++++- scripts/reduce_sql.py | 1 - 2 files changed, 9 insertions(+), 2 deletions(-) diff --git a/scripts/fuzzer_helper.py b/scripts/fuzzer_helper.py index 1de340c..34f9474 100644 --- a/scripts/fuzzer_helper.py +++ b/scripts/fuzzer_helper.py @@ -3,6 +3,7 @@ import os import subprocess import urllib.parse +import re USERNAME = 'fuzzerofducks' @@ -242,7 +243,14 @@ def is_internal_error(error): return False +def sanitize_stacktrace(err): + err = re.sub(r'../duckdb\((.*)\)', r'\1', err) + err = re.sub(r'[\+\[]?0x[0-9a-fA-F]+\]?', '', err) + err = re.sub(r'/lib/x86_64-linux-gnu/libc.so(.*)\n', '', err) + return err.strip() + + def split_exception_trace(exception_msg_full: str) -> tuple[str, str]: # exception message does not contain newline, so split after first newline exception_msg, _, stack_trace = exception_msg_full.partition('\n') - return (exception_msg.strip(), stack_trace.strip()) + return (exception_msg.strip(), sanitize_stacktrace(stack_trace)) diff --git a/scripts/reduce_sql.py b/scripts/reduce_sql.py index e2e93a6..fa55ba5 100644 --- a/scripts/reduce_sql.py +++ b/scripts/reduce_sql.py @@ -1,4 +1,3 @@ -import re import subprocess import time import os From b33b671211bdd202412421c632cef3e1f854f9c7 Mon Sep 17 00:00:00 2001 From: Christiaan Herrewijn Date: Thu, 6 Mar 2025 15:47:32 +0100 Subject: [PATCH 3/3] only query open github issues --- scripts/fuzzer_helper.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/fuzzer_helper.py b/scripts/fuzzer_helper.py index 34f9474..64ad88a 100644 --- a/scripts/fuzzer_helper.py +++ b/scripts/fuzzer_helper.py @@ -43,7 +43,7 @@ def issue_url(): def issues_by_title_url(issue_title): base_url = "https://api.github.com/search/issues" - query_string = 
urllib.parse.quote(f"repo:{REPO_OWNER}/{REPO_NAME} {issue_title} in:title") + query_string = urllib.parse.quote(f"repo:{REPO_OWNER}/{REPO_NAME} {issue_title} in:title is:open") return f"{base_url}?q={query_string}"