diff --git a/.gitignore b/.gitignore
index 203a6db..38dd159 100644
--- a/.gitignore
+++ b/.gitignore
@@ -3,4 +3,6 @@ __pycache__/
 build/
 lang2sql.egg-info/
 dist/
-.pypirc
\ No newline at end of file
+.pypirc
+mlruns
+table_info_db
\ No newline at end of file
diff --git a/cli/__init__.py b/cli/__init__.py
index 1fd9fec..d339cf2 100644
--- a/cli/__init__.py
+++ b/cli/__init__.py
@@ -1,6 +1,10 @@
 import click
 import subprocess
 from llm_utils.tools import set_gms_server
+from langchain_core.messages import HumanMessage
+from llm_evaluation.evaluator import Evaluator
+from llm_utils.graph import builder
+import os
 
 
 @click.group()
@@ -11,12 +15,17 @@
 )
 @click.option("--run-streamlit", is_flag=True, help="Run the Streamlit app.")
 @click.option("-p", "--port", type=int, default=8501, help="Streamlit port")
-def cli(ctx, datahub_server, run_streamlit, port):
+@click.option("--mlflow-tracking-uri", default=None, help="MLflow tracking server URI")
+def cli(ctx, datahub_server, run_streamlit, port, mlflow_tracking_uri):
     try:
         set_gms_server(datahub_server)
     except ValueError as e:
         click.echo(str(e))
         ctx.exit(1)
+
+    if mlflow_tracking_uri:
+        os.environ["MLFLOW_TRACKING_URI"] = mlflow_tracking_uri
+
     if run_streamlit:
         run_streamlit_command(port)
 
@@ -33,3 +42,31 @@ def run_streamlit_command(port):
 def run_streamlit(port):
     """Run the Streamlit app."""
     run_streamlit_command(port)
+
+
+@cli.command()
+@click.argument("dataset_path", type=click.Path(exists=True))
+@click.option(
+    "--user-database-env", default="clickhouse", help="User database environment"
+)
+def evaluate(dataset_path, user_database_env):
+    """Evaluate the SQL generation model."""
+    click.echo(f"Starting evaluation with dataset {dataset_path}...")
+
+    evaluator = Evaluator(dataset_path)
+
+    def generated_sql_fn(question: str):
+        graph = builder.compile()
+
+        res = graph.invoke(
+            input={
+                "messages": [HumanMessage(content=question)],
+                "user_database_env": user_database_env,
+                "best_practice_query": "",
+            }
+        )
+
+        return res["generated_query"].content
+
+    results = evaluator.evaluate(generated_sql_fn)
+    click.echo(f"Evaluation complete! Evaluated {len(results)} queries.")
diff --git a/docs/evaluation.md b/docs/evaluation.md
new file mode 100644
index 0000000..ebb3ff3
--- /dev/null
+++ b/docs/evaluation.md
@@ -0,0 +1,24 @@
+# Evaluation
+
+## MLflow
+
+### Running the tracking server
+
+- `docker run --name mlflow_postgres -e POSTGRES_USER=postgres -e POSTGRES_PASSWORD=postgres -e POSTGRES_DB=mlflow_db -p 5430:5432 -d postgres`
+- `mlflow server --backend-store-uri postgresql://postgres:postgres@localhost:5430/mlflow_db --default-artifact-root ./mlruns --host 0.0.0.0 --port 5000`
+  - `--backend-store-uri`: the URI of the database where MLflow stores its metadata. For PostgreSQL this takes the form `postgresql://<user>:<password>@<host>:<port>/<db>`.
+  - `--default-artifact-root`: the default location where MLflow stores model artifacts, e.g. `./mlruns` on the local file system.
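+
+For example, assuming the tracking server above is reachable at `http://localhost:5000`, Lang2SQL can be pointed at it through the `--mlflow-tracking-uri` option (or the `MLFLOW_TRACKING_URI` environment variable):
+
+```
+lang2sql --mlflow-tracking-uri http://localhost:5000 evaluate /path/to/dataset.json
+```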
+
+
+## Running the evaluation
+
+### CLI command
+
+The SQL generation model can be evaluated with the following CLI command:
+```
+lang2sql evaluate /path/to/dataset.json --user-database-env clickhouse
+```
+
+### Example output
+
+![MLflow](./mlflow.png)
diff --git a/docs/mlflow.png b/docs/mlflow.png
new file mode 100644
index 0000000..7fad7e3
Binary files /dev/null and b/docs/mlflow.png differ
diff --git a/llm_evaluation/__init__.py b/llm_evaluation/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/llm_evaluation/dataset.py b/llm_evaluation/dataset.py
new file mode 100644
index 0000000..29967a2
--- /dev/null
+++ b/llm_evaluation/dataset.py
@@ -0,0 +1,45 @@
+import json
+import pandas as pd
+import os
+
+
+class QADataset:
+    def __init__(self, dataset_path):
+        self.dataset_path = dataset_path
+        self.data = self._load_dataset()
+        self.results_path = dataset_path.replace(".json", "_results.json")
+
+    def _load_dataset(self):
+        """Load the question-answer dataset from a JSON file."""
+        with open(self.dataset_path, "r", encoding="utf-8") as f:
+            data = json.load(f)
+            if isinstance(data, list):
+                return pd.DataFrame(data)
+            elif isinstance(data, dict):
+                return pd.DataFrame.from_dict(data)
+            else:
+                raise ValueError(
+                    "Unsupported JSON format. The dataset must be a list or a dictionary."
+                )
+
+    def get_samples(self):
+        """Yield each question, ground-truth SQL, and evaluation type (evaluation_type) from the dataset."""
+        if "evaluation_type" in self.data.columns:
+            eval_types = self.data["evaluation_type"].tolist()
+        else:
+            eval_types = [None] * len(self.data["inputs"])
+
+        for i in range(len(self.data["inputs"])):
+            yield self.data["inputs"][i], self.data["ground_truths"][i], eval_types[i]
+
+    def save_feedback(self, feedback_data):
+        """Append an evaluation result to a separate results file."""
+        results = []
+        if os.path.exists(self.results_path):
+            with open(self.results_path, "r", encoding="utf-8") as f:
+                results = json.load(f)
+
+        results.append(feedback_data)
+
+        with open(self.results_path, "w", encoding="utf-8") as f:
+            json.dump(results, f, ensure_ascii=False, indent=4)
diff --git a/llm_evaluation/evaluator.py b/llm_evaluation/evaluator.py
new file mode 100644
index 0000000..c15ceca
--- /dev/null
+++ b/llm_evaluation/evaluator.py
@@ -0,0 +1,87 @@
+import time
+import mlflow
+import os
+import importlib.metadata
+from langchain_core.messages import HumanMessage
+
+import numpy as np
+from llm_evaluation.dataset import QADataset
+from llm_evaluation.llm_evaluator import compare_sql_with_llm
+from llm_evaluation.mlflow_logger import log_to_mlflow
+
+
+class Evaluator:
+    def __init__(self, dataset_path):
+        self.dataset = QADataset(dataset_path)
+
+    def evaluate(self, generated_sql_fn):
+        """Evaluate Lang2SQL (the caller supplies the SQL generation function).
+
+        Each evaluation sample is recorded as a nested run.
+        """
+        results = []
+        metrics_by_type = {}  # scores collected per evaluation_type
+
+        # MLflow setup: tracking URI and experiment name
+        mlflow.set_tracking_uri(os.getenv("MLFLOW_TRACKING_URI"))
+        try:
+            lang2sql_version = importlib.metadata.version("lang2sql")
+        except importlib.metadata.PackageNotFoundError:
+            lang2sql_version = "unknown"
+        experiment_name = f"lang2sql-evaluation-v{lang2sql_version}"
+        mlflow.set_experiment(experiment_name)
+
+        # Wrap the whole evaluation in a single parent run
+        with mlflow.start_run(run_name="evaluation_run") as parent_run:
+            for (
+                question,
+                ground_truth_sql,
+                evaluation_type,
+            ) in self.dataset.get_samples():
+                start_time = time.time()
+                generated_sql = generated_sql_fn(question)
+                exec_time = time.time() - start_time
+
+                # LLM evaluation result (currently a single similarity score)
+                llm_score = compare_sql_with_llm(
+                    generated_sql, ground_truth_sql, question
+                )
+                # Collect scores per evaluation_type
+                if evaluation_type not in metrics_by_type:
+                    metrics_by_type[evaluation_type] = []
+                metrics_by_type[evaluation_type].append(llm_score)
+
+                feedback_data = {
+                    "question": question,
+                    "generated_sql": generated_sql,
+                    "ground_truth_sql": ground_truth_sql,
+                    "llm_evaluation_metric": llm_score,  # per-query metric
+                    "execution_time": exec_time,
+                    "evaluation_type": evaluation_type,
+                }
+
+                # Record each sample as a nested run (named after its evaluation_type)
+                with mlflow.start_run(nested=True, run_name=str(evaluation_type)):
+                    log_to_mlflow(
+                        question,
+                        generated_sql,
+                        ground_truth_sql,
+                        llm_score,
+                        evaluation_type,
+                    )
+                    mlflow.log_metric("execution_time", exec_time)
+
+                self.dataset.save_feedback(feedback_data)
+                results.append(feedback_data)
+
+            # Per-evaluation_type metrics (intended aggregates: mean, max, min, median)
+            aggregated_metrics = {}
+            for eval_type, scores in metrics_by_type.items():
+
+                for idx, score in enumerate(scores):
+                    mlflow.log_metric(f"{eval_type}", score, step=idx)
+
+            # Record aggregated_metrics as a tag as well (stringified)
+            mlflow.set_tag("aggregated_metrics", str(aggregated_metrics))
+
+        return results
diff --git a/llm_evaluation/example.json b/llm_evaluation/example.json
new file mode 100644
index 0000000..b6c0602
--- /dev/null
+++ b/llm_evaluation/example.json
@@ -0,0 +1,13 @@
+{
+    "evaluation_type": ["subscription", "hired_sdr", "subscription"],
+    "inputs": [
+        "Number of unique users on the subscription service",
+        "How many sales SDRs has the company hired?",
+        "Among free users (free_users) who started a subscription, I want the number of users per plan."
+    ],
+    "ground_truths": [
+        "SELECT toStartOfMonth(activity_ts) AS month, COUNT(DISTINCT entity_id) AS unique_users FROM client_stream_active_on_subscription WHERE activity_ts >= date_sub(current_date(), interval 3 month) GROUP BY month ORDER BY month",
+        "SELECT COUNT(DISTINCT entity_id) AS total_hired_sdrs FROM company_stream_hired_sdr WHERE activity_ts >= now() - INTERVAL 3 MONTH",
+        "SELECT feature_json, COUNT(entity_id) AS user_count FROM client_stream_started_subscription WHERE feature_json LIKE '%\"free_users\"%' GROUP BY feature_json"
+    ]
+}
diff --git a/llm_evaluation/llm_evaluator.py b/llm_evaluation/llm_evaluator.py
new file mode 100644
index 0000000..9ce1589
--- /dev/null
+++ b/llm_evaluation/llm_evaluator.py
@@ -0,0 +1,50 @@
+import os
+from langchain_core.prompts import ChatPromptTemplate
+from llm_utils.llm_factory import get_llm
+
+
+def compare_sql_with_llm(generated_sql, ground_truth_sql, user_query):
+    """Score generated SQL against the ground-truth SQL with an LLM (0 to 1)."""
+
+    # Initialize the LLM
+    llm = get_llm(
+        model_type="openai",
+        model_name="gpt-4o-mini",
+        openai_api_key=os.getenv("OPENAI_API_KEY"),
+    )
+
+    # Build the prompt template (the SQL strings are passed as template variables)
+    prompt = ChatPromptTemplate.from_messages(
+        [
+            (
+                "system",
+                """
+            You are a SQL expert. Compare the correctness and similarity of the two SQL queries below.
+            Rate their similarity on a scale from 0 (completely different) to 1 (identical).
+
+            Input description (what the SQL is supposed to do):
+            {user_query}
+
+            Ground-truth SQL:
+            {ground_truth_sql}
+
+            Generated SQL:
+            {generated_sql}
+
+            Consider both the similarity to the ground truth and whether the generated SQL correctly satisfies the input description.
+            Return only a similarity score between 0 and 1, rounded to two decimal places (e.g. 0.75, 0.42, 1.00).
+            Do not include any explanation or additional text.
+            """,
+            )
+        ]
+    )
+
+    # Run the LLM chain
+    chain = prompt | llm
+    response = chain.invoke({"user_query": user_query, "ground_truth_sql": ground_truth_sql, "generated_sql": generated_sql})
+
+    try:
+        score = float(response.content.strip())
+        return max(0, min(score, 1))  # clamp to the 0-1 range
+    except (ValueError, AttributeError):
+        return 0.0  # score 0 if the response cannot be parsed
diff --git a/llm_evaluation/mlflow_logger.py b/llm_evaluation/mlflow_logger.py
new file mode 100644
index 0000000..c9f32fa
--- /dev/null
+++ b/llm_evaluation/mlflow_logger.py
@@ -0,0 +1,21 @@
+import os
+import importlib.metadata
+import mlflow
+from dotenv import load_dotenv
+
+load_dotenv()
+
+
+def log_to_mlflow(
+    question, generated_sql, ground_truth_sql, llm_metric, evaluation_type=None
+):
+    """Log evaluation results inside the currently active run.
+
+    llm_metric is logged via mlflow.log_param, which stores the value as a string internally.
+    """
+    mlflow.log_param("question", question)
+    mlflow.log_param("generated_sql", generated_sql)
+    mlflow.log_param("ground_truth_sql", ground_truth_sql)
+    mlflow.log_param("llm_evaluation_metric", llm_metric)
+    if evaluation_type is not None:
+        mlflow.log_param("evaluation_type", evaluation_type)
diff --git a/requirements.txt b/requirements.txt
index 2c506a8..7b17b47 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -8,6 +8,7 @@ streamlit==1.41.1
 python-dotenv==1.0.1
 faiss-cpu==1.10.0
 pre_commit==4.1.0
+mlflow==2.21.0
 setuptools
 wheel
 twine
diff --git a/setup.py b/setup.py
index d5e4805..bbf59f4 100644
--- a/setup.py
+++ b/setup.py
@@ -24,6 +24,7 @@
         "streamlit==1.41.1",
         "python-dotenv==1.0.1",
         "faiss-cpu==1.10.0",
+        "mlflow==2.21.0",
     ],
     entry_points={
         "console_scripts": [
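Note: in `llm_evaluation/evaluator.py`, `aggregated_metrics` is initialized and logged as a tag but never populated, even though the comment and the otherwise unused `numpy` import point at per-type mean/max/min/median summaries. A minimal sketch of what that aggregation step could look like (illustrative only; `metrics_by_type` below is stubbed with sample scores rather than real evaluation output):

```python
import mlflow
import numpy as np

# Stand-in for the scores that evaluate() collects per evaluation_type.
metrics_by_type = {"subscription": [0.75, 1.00], "hired_sdr": [0.42]}

aggregated_metrics = {}
with mlflow.start_run(run_name="evaluation_run"):
    for eval_type, scores in metrics_by_type.items():
        # Summary statistics per evaluation_type (mean, max, min, median).
        aggregated_metrics[eval_type] = {
            "mean": float(np.mean(scores)),
            "max": float(np.max(scores)),
            "min": float(np.min(scores)),
            "median": float(np.median(scores)),
        }
        # Step-wise per-sample scores, as the existing loop already logs them.
        for idx, score in enumerate(scores):
            mlflow.log_metric(str(eval_type), score, step=idx)

    # Stringified summary recorded as a run tag, mirroring evaluator.py.
    mlflow.set_tag("aggregated_metrics", str(aggregated_metrics))
```

Logging the summaries as numeric metrics (for example a hypothetical `f"{eval_type}_mean"` key) instead of a string tag would also make them easier to compare across runs in the MLflow UI.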