test3 #34

Closed · wants to merge 1 commit
4 changes: 3 additions & 1 deletion .gitignore
@@ -3,4 +3,6 @@ __pycache__/
build/
lang2sql.egg-info/
dist/
.pypirc
.pypirc
mlruns
table_info_db
39 changes: 38 additions & 1 deletion cli/__init__.py
@@ -1,6 +1,10 @@
import click
import subprocess
from llm_utils.tools import set_gms_server
from langchain_core.messages import HumanMessage
from llm_evaluation.evaluator import Evaluator
from llm_utils.graph import builder
import os


@click.group()
@@ -11,12 +15,17 @@
)
@click.option("--run-streamlit", is_flag=True, help="Run the Streamlit app.")
@click.option("-p", "--port", type=int, default=8501, help="Streamlit port")
def cli(ctx, datahub_server, run_streamlit, port):
@click.option("--mlflow-tracking-uri", default=None, help="MLflow tracking server URI")
def cli(ctx, datahub_server, run_streamlit, port, mlflow_tracking_uri):
    try:
        set_gms_server(datahub_server)
    except ValueError as e:
        click.echo(str(e))
        ctx.exit(1)

    if mlflow_tracking_uri:
        os.environ["MLFLOW_TRACKING_URI"] = mlflow_tracking_uri

    if run_streamlit:
        run_streamlit_command(port)

@@ -33,3 +42,31 @@ def run_streamlit_command(port):
def run_streamlit(port):
    """Run the Streamlit app."""
    run_streamlit_command(port)


@cli.command()
@click.argument("dataset_path", type=click.Path(exists=True))
@click.option(
    "--user-database-env", default="clickhouse", help="User database environment"
)
def evaluate(dataset_path, user_database_env):
    """Evaluate the SQL generation model."""
    click.echo(f"Starting evaluation with dataset {dataset_path}...")

    evaluator = Evaluator(dataset_path)

    def generated_sql_fn(question: str):
        graph = builder.compile()

        res = graph.invoke(
            input={
                "messages": [HumanMessage(content=question)],
                "user_database_env": user_database_env,
                "best_practice_query": "",
            }
        )

        return res["generated_query"].content

    results = evaluator.evaluate(generated_sql_fn)
    click.echo(f"Evaluation complete! {len(results)} queries evaluated")
24 changes: 24 additions & 0 deletions docs/evaluation.md
@@ -0,0 +1,24 @@
# Evaluation

## MLflow

### Running the server

- `docker run --name mlflow_postgres -e POSTGRES_USER=postgres -e POSTGRES_PASSWORD=postgres -e POSTGRES_DB=mlflow_db -p 5430:5432 -d postgres`
- `mlflow server --backend-store-uri postgresql://postgres:postgres@localhost:5430/mlflow_db --default-artifact-root ./mlruns --host 0.0.0.0 --port 5000`
- `--backend-store-uri`: the URI of the database where MLflow stores its metadata. For a PostgreSQL database, use the form `postgresql://<username>:<password>@<hostname>:<port>/<database_name>`.
- `--default-artifact-root`: the default location where MLflow stores model artifacts. For the local filesystem, a path such as `./mlruns` works. The evaluator reads the server address from `MLFLOW_TRACKING_URI`, as shown in the sketch below.
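
A minimal sketch of how the running server gets picked up: the evaluator reads `MLFLOW_TRACKING_URI` from the environment (the CLI option `--mlflow-tracking-uri` sets the same variable), so pointing it at the server started above might look like this; the URL assumes the local server on port 5000:

```
import os
import mlflow

# Assumption: the MLflow server from the commands above is listening on localhost:5000
os.environ["MLFLOW_TRACKING_URI"] = "http://localhost:5000"

# This mirrors what llm_evaluation.evaluator.Evaluator does before logging runs
mlflow.set_tracking_uri(os.getenv("MLFLOW_TRACKING_URI"))
print(mlflow.get_tracking_uri())
```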


## Running an evaluation

### CLI command

You can evaluate the SQL generation model with the following CLI command:
```
lang2sql evaluate /path/to/dataset.json --user-database-env clickhouse
```
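
The command wraps the `Evaluator` class added in this PR; it needs `OPENAI_API_KEY` (for the scoring LLM) and, for remote tracking, `MLFLOW_TRACKING_URI` set. To run an evaluation programmatically, a rough sketch might look like the following; the dataset path is hypothetical and the stub generator stands in for the real LangGraph pipeline used by the CLI:

```
from llm_evaluation.evaluator import Evaluator

# Hypothetical path; point this at your own dataset file
evaluator = Evaluator("/path/to/dataset.json")


def generated_sql_fn(question: str) -> str:
    # Stub for illustration only; the CLI compiles and invokes the llm_utils graph instead
    return "SELECT 1"


results = evaluator.evaluate(generated_sql_fn)
print(f"{len(results)} queries evaluated")
```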

### Example result

![MLFlow](./mlflow.png)
Binary file added docs/mlflow.png
Empty file added llm_evaluation/__init__.py
Empty file.
45 changes: 45 additions & 0 deletions llm_evaluation/dataset.py
@@ -0,0 +1,45 @@
import json
import pandas as pd
import os


class QADataset:
    def __init__(self, dataset_path):
        self.dataset_path = dataset_path
        self.data = self._load_dataset()
        self.results_path = dataset_path.replace(".json", "_results.json")

    def _load_dataset(self):
        """Load the question-answer dataset from a JSON file."""
        with open(self.dataset_path, "r", encoding="utf-8") as f:
            data = json.load(f)
            if isinstance(data, list):
                return pd.DataFrame(data)
            elif isinstance(data, dict):
                return pd.DataFrame.from_dict(data)
            else:
                raise ValueError(
                    "Unsupported JSON format: the file must contain a list or a dictionary."
                )

    def get_samples(self):
        """Yield each question, ground-truth SQL, and evaluation_type from the dataset."""
        if "evaluation_type" in self.data.columns:
            eval_types = self.data["evaluation_type"].tolist()
        else:
            eval_types = [None] * len(self.data["inputs"])

        for i in range(len(self.data["inputs"])):
            yield self.data["inputs"][i], self.data["ground_truths"][i], eval_types[i]

    def save_feedback(self, feedback_data):
        """Append the evaluation result to a separate results file."""
        results = []
        if os.path.exists(self.results_path):
            with open(self.results_path, "r", encoding="utf-8") as f:
                results = json.load(f)

        results.append(feedback_data)

        with open(self.results_path, "w", encoding="utf-8") as f:
            json.dump(results, f, ensure_ascii=False, indent=4)
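
For reference, a minimal usage sketch of `QADataset` on its own; the path is hypothetical and the JSON layout is assumed to match `llm_evaluation/example.json`:

```
from llm_evaluation.dataset import QADataset

dataset = QADataset("llm_evaluation/example.json")  # hypothetical local path

for question, ground_truth_sql, evaluation_type in dataset.get_samples():
    print(evaluation_type, question)

# Feedback entries are appended to <dataset>_results.json next to the input file
dataset.save_feedback({"question": "demo", "llm_evaluation_metric": 1.0})
```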
87 changes: 87 additions & 0 deletions llm_evaluation/evaluator.py
@@ -0,0 +1,87 @@
import time
import mlflow
import os
import importlib.metadata

import numpy as np
from llm_evaluation.dataset import QADataset
from llm_evaluation.llm_evaluator import compare_sql_with_llm
from llm_evaluation.mlflow_logger import log_to_mlflow


class Evaluator:
    def __init__(self, dataset_path):
        self.dataset = QADataset(dataset_path)

    def evaluate(self, generated_sql_fn):
        """Evaluate Lang2SQL (the caller supplies the SQL generation function).

        Each evaluation sample is recorded as a nested run.
        """
        results = []
        metrics_by_type = {}  # scores collected per evaluation_type

        # MLflow setup: tracking URI and experiment name
        mlflow.set_tracking_uri(os.getenv("MLFLOW_TRACKING_URI"))
        try:
            lang2sql_version = importlib.metadata.version("lang2sql")
        except importlib.metadata.PackageNotFoundError:
            lang2sql_version = "unknown"
        experiment_name = f"lang2sql-evaluation-v{lang2sql_version}"
        mlflow.set_experiment(experiment_name)

        # Wrap the whole evaluation in a single parent run
        with mlflow.start_run(run_name="evaluation_run"):
            for (
                question,
                ground_truth_sql,
                evaluation_type,
            ) in self.dataset.get_samples():
                start_time = time.time()
                generated_sql = generated_sql_fn(question)
                exec_time = time.time() - start_time

                # LLM evaluation result (currently a single score)
                llm_score = compare_sql_with_llm(
                    generated_sql, ground_truth_sql, question
                )
                # Collect scores by evaluation_type
                if evaluation_type not in metrics_by_type:
                    metrics_by_type[evaluation_type] = []
                metrics_by_type[evaluation_type].append(llm_score)

                feedback_data = {
                    "question": question,
                    "generated_sql": generated_sql,
                    "ground_truth_sql": ground_truth_sql,
                    "llm_evaluation_metric": llm_score,  # per-query metric
                    "execution_time": exec_time,
                    "evaluation_type": evaluation_type,
                }

                # Record each sample as a nested run (evaluation_type as the run name)
                with mlflow.start_run(nested=True, run_name=str(evaluation_type)):
                    log_to_mlflow(
                        question,
                        generated_sql,
                        ground_truth_sql,
                        llm_score,
                        evaluation_type,
                    )
                    mlflow.log_metric("execution_time", exec_time)

                self.dataset.save_feedback(feedback_data)
                results.append(feedback_data)

            # Aggregate the per-type scores (mean, max, min, median)
            aggregated_metrics = {}
            for eval_type, scores in metrics_by_type.items():
                aggregated_metrics[eval_type] = {
                    "mean": float(np.mean(scores)),
                    "max": float(np.max(scores)),
                    "min": float(np.min(scores)),
                    "median": float(np.median(scores)),
                }
                for idx, score in enumerate(scores):
                    mlflow.log_metric(f"{eval_type}", score, step=idx)

            # Also record aggregated_metrics as a tag (converted to a string)
            mlflow.set_tag("aggregated_metrics", str(aggregated_metrics))

        return results
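
Once an evaluation finishes, the nested runs can be pulled back from the tracking server. A rough sketch, assuming the experiment-name pattern set above and a hypothetical installed `lang2sql` version of 0.1.0:

```
import mlflow

mlflow.set_tracking_uri("http://localhost:5000")  # assumed local tracking server

# Experiment names follow f"lang2sql-evaluation-v{version}" as set in Evaluator.evaluate
runs = mlflow.search_runs(experiment_names=["lang2sql-evaluation-v0.1.0"])
print(runs[["run_id", "metrics.execution_time"]].head())
```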
13 changes: 13 additions & 0 deletions llm_evaluation/example.json
@@ -0,0 +1,13 @@
{
    "evaluation_type": ["subscription", "hired_sdr", "subscription"],
    "inputs": [
        "구독서비스를 이용하는 유니크한 유저수",
        "회사의 고용된 영업 SDR 수가 궁금해",
        "subscription을 시작한 무료 사용자(free_users) 중 각 플랜별 사용자 수를 알고 싶습니다."
    ],
    "ground_truths": [
        "SELECT toStartOfMonth(activity_ts) AS month, COUNT(DISTINCT entity_id) AS unique_users FROM client_stream_active_on_subscription WHERE activity_ts >= date_sub(current_date(), interval 3 month) GROUP BY month ORDER BY month",
        "SELECT COUNT(DISTINCT entity_id) AS total_hired_sdrs FROM company_stream_hired_sdr WHERE activity_ts >= now() - INTERVAL 3 MONTH",
        "SELECT feature_json, COUNT(entity_id) AS user_count FROM client_stream_started_subscription WHERE feature_json LIKE '%\"free_users\"%' GROUP BY feature_json"
    ]
}
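
This example uses the dict-of-lists layout, which `QADataset._load_dataset` turns into a three-row DataFrame via `pd.DataFrame.from_dict`; a list of per-sample objects with the same keys would also be accepted. A quick sketch of the equivalent transformation:

```
import json

import pandas as pd

with open("llm_evaluation/example.json", encoding="utf-8") as f:
    data = json.load(f)

df = pd.DataFrame.from_dict(data)
print(df.columns.tolist())  # ['evaluation_type', 'inputs', 'ground_truths']
print(len(df))  # 3
```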
50 changes: 50 additions & 0 deletions llm_evaluation/llm_evaluator.py
@@ -0,0 +1,50 @@
import os
from langchain_core.prompts import ChatPromptTemplate
from llm_utils.llm_factory import get_llm


def compare_sql_with_llm(generated_sql, ground_truth_sql, user_query):
    """Score a generated SQL query against the ground truth with an LLM (0 to 1)."""

    # Initialize the LLM
    llm = get_llm(
        model_type="openai",
        model_name="gpt-4o-mini",
        openai_api_key=os.getenv("OPENAI_API_KEY"),
    )

    # Build the prompt template
    prompt = ChatPromptTemplate.from_messages(
        [
            (
                "system",
                f"""
                You are an SQL expert. Compare the correctness and similarity of the two SQL queries below.
                Rate their similarity on a scale from 0 (completely different) to 1 (identical).

                Input description (what the SQL is supposed to do):
                {user_query}

                Ground-truth SQL:
                {ground_truth_sql}

                Generated SQL:
                {generated_sql}

                Consider both the similarity to the ground-truth query and whether the generated SQL correctly handles the input description.
                Return only a similarity score between 0 and 1, rounded to two decimal places (e.g. 0.75, 0.42, 1.00).
                Do not include any explanation or additional text.
                """,
            )
        ]
    )

    # Run the LLM chain
    chain = prompt | llm
    response = chain.invoke({})

    try:
        score = float(response.content.strip())
        return max(0, min(score, 1))  # clamp to the 0-1 range
    except (ValueError, TypeError):
        return 0.0  # score 0.0 when the response cannot be parsed
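
A quick usage sketch (requires `OPENAI_API_KEY` in the environment; the queries are toy examples based on the tables in `llm_evaluation/example.json`):

```
from llm_evaluation.llm_evaluator import compare_sql_with_llm

score = compare_sql_with_llm(
    generated_sql="SELECT COUNT(DISTINCT entity_id) FROM client_stream_active_on_subscription",
    ground_truth_sql="SELECT COUNT(DISTINCT entity_id) AS unique_users FROM client_stream_active_on_subscription",
    user_query="How many unique users are on a subscription?",
)
print(score)  # a float clamped to 0-1, or 0.0 if the model reply cannot be parsed
```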
21 changes: 21 additions & 0 deletions llm_evaluation/mlflow_logger.py
@@ -0,0 +1,21 @@
import mlflow
from dotenv import load_dotenv

load_dotenv()


def log_to_mlflow(
    question, generated_sql, ground_truth_sql, llm_metric, evaluation_type=None
):
    """Log evaluation results inside the currently active MLflow run.

    llm_metric may be a dict; mlflow.log_param stores values internally as strings.
    """
    mlflow.log_param("question", question)
    mlflow.log_param("generated_sql", generated_sql)
    mlflow.log_param("ground_truth_sql", ground_truth_sql)
    mlflow.log_param("llm_evaluation_metric", llm_metric)
    if evaluation_type is not None:
        mlflow.log_param("evaluation_type", evaluation_type)
1 change: 1 addition & 0 deletions requirements.txt
@@ -8,6 +8,7 @@ streamlit==1.41.1
python-dotenv==1.0.1
faiss-cpu==1.10.0
pre_commit==4.1.0
mlflow==2.21.0
setuptools
wheel
twine
1 change: 1 addition & 0 deletions setup.py
@@ -24,6 +24,7 @@
"streamlit==1.41.1",
"python-dotenv==1.0.1",
"faiss-cpu==1.10.0",
"mlflow==2.21.0",
],
entry_points={
"console_scripts": [