diff --git a/.gitignore b/.gitignore
index 203a6db..38dd159 100644
--- a/.gitignore
+++ b/.gitignore
@@ -3,4 +3,6 @@ __pycache__/
 build/
 lang2sql.egg-info/
 dist/
-.pypirc
\ No newline at end of file
+.pypirc
+mlruns
+table_info_db
\ No newline at end of file
diff --git a/cli/__init__.py b/cli/__init__.py
index 1fd9fec..d339cf2 100644
--- a/cli/__init__.py
+++ b/cli/__init__.py
@@ -1,6 +1,10 @@
 import click
 import subprocess
 from llm_utils.tools import set_gms_server
+from langchain_core.messages import HumanMessage
+from llm_evaluation.evaluator import Evaluator
+from llm_utils.graph import builder
+import os
 
 
 @click.group()
@@ -11,12 +15,17 @@
 )
 @click.option("--run-streamlit", is_flag=True, help="Run the Streamlit app.")
 @click.option("-p", "--port", type=int, default=8501, help="Streamlit port")
-def cli(ctx, datahub_server, run_streamlit, port):
+@click.option("--mlflow-tracking-uri", default=None, help="MLflow tracking server URI")
+def cli(ctx, datahub_server, run_streamlit, port, mlflow_tracking_uri):
     try:
         set_gms_server(datahub_server)
     except ValueError as e:
         click.echo(str(e))
         ctx.exit(1)
+
+    if mlflow_tracking_uri:
+        os.environ["MLFLOW_TRACKING_URI"] = mlflow_tracking_uri
+
     if run_streamlit:
         run_streamlit_command(port)
 
@@ -33,3 +42,31 @@ def run_streamlit_command(port):
 def run_streamlit(port):
     """Run the Streamlit app."""
     run_streamlit_command(port)
+
+
+@cli.command()
+@click.argument("dataset_path", type=click.Path(exists=True))
+@click.option(
+    "--user-database-env", default="clickhouse", help="User database environment"
+)
+def evaluate(dataset_path, user_database_env):
+    """Evaluate the SQL generation model."""
+    click.echo(f"Starting evaluation with dataset {dataset_path}...")
+
+    evaluator = Evaluator(dataset_path)
+
+    def generated_sql_fn(question: str):
+        graph = builder.compile()
+
+        res = graph.invoke(
+            input={
+                "messages": [HumanMessage(content=question)],
+                "user_database_env": user_database_env,
+                "best_practice_query": "",
+            }
+        )
+
+        return res["generated_query"].content
+
+    results = evaluator.evaluate(generated_sql_fn)
+    click.echo(f"Evaluation complete! Evaluated {len(results)} queries.")
diff --git a/docs/evaluation.md b/docs/evaluation.md
new file mode 100644
index 0000000..ebb3ff3
--- /dev/null
+++ b/docs/evaluation.md
@@ -0,0 +1,24 @@
+# Evaluation
+
+## MLflow
+
+### Running the tracking server
+
+- `docker run --name mlflow_postgres -e POSTGRES_USER=postgres -e POSTGRES_PASSWORD=postgres -e POSTGRES_DB=mlflow_db -p 5430:5432 -d postgres`
+- `mlflow server --backend-store-uri postgresql://postgres:postgres@localhost:5430/mlflow_db --default-artifact-root ./mlruns --host 0.0.0.0 --port 5000`
+  - `--backend-store-uri`: the URI of the database where MLflow stores its metadata. For PostgreSQL this takes the form `postgresql://<user>:<password>@<host>:<port>/<db>`.
+  - `--default-artifact-root`: the default location where MLflow stores model artifacts, e.g. `./mlruns` on the local file system.
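+
+For example, assuming the tracking server above is reachable at `http://localhost:5000`, Lang2SQL can be pointed at it through the `--mlflow-tracking-uri` option (or the `MLFLOW_TRACKING_URI` environment variable):
+
+```
+lang2sql --mlflow-tracking-uri http://localhost:5000 evaluate /path/to/dataset.json
+```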
+
+
+## Running the evaluation
+
+### CLI command
+
+The SQL generation model can be evaluated with the following CLI command:
+```
+lang2sql evaluate /path/to/dataset.json --user-database-env clickhouse
+```
+
+### Example output
+
+![MLflow](./mlflow.png)
diff --git a/docs/mlflow.png b/docs/mlflow.png
new file mode 100644
index 0000000..7fad7e3
Binary files /dev/null and b/docs/mlflow.png differ
diff --git a/llm_evaluation/__init__.py b/llm_evaluation/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/llm_evaluation/dataset.py b/llm_evaluation/dataset.py
new file mode 100644
index 0000000..29967a2
--- /dev/null
+++ b/llm_evaluation/dataset.py
@@ -0,0 +1,45 @@
+import json
+import pandas as pd
+import os
+
+
+class QADataset:
+    def __init__(self, dataset_path):
+        self.dataset_path = dataset_path
+        self.data = self._load_dataset()
+        self.results_path = dataset_path.replace(".json", "_results.json")
+
+    def _load_dataset(self):
+        """Load the question-answer dataset from a JSON file."""
+        with open(self.dataset_path, "r", encoding="utf-8") as f:
+            data = json.load(f)
+            if isinstance(data, list):
+                return pd.DataFrame(data)
+            elif isinstance(data, dict):
+                return pd.DataFrame.from_dict(data)
+            else:
+                raise ValueError(
+                    "Unsupported JSON format. The dataset must be a list or a dictionary."
+                )
+
+    def get_samples(self):
+        """Yield each question, ground-truth SQL, and evaluation type (evaluation_type) from the dataset."""
+        if "evaluation_type" in self.data.columns:
+            eval_types = self.data["evaluation_type"].tolist()
+        else:
+            eval_types = [None] * len(self.data["inputs"])
+
+        for i in range(len(self.data["inputs"])):
+            yield self.data["inputs"][i], self.data["ground_truths"][i], eval_types[i]
+
+    def save_feedback(self, feedback_data):
+        """Append an evaluation result to a separate results file."""
+        results = []
+        if os.path.exists(self.results_path):
+            with open(self.results_path, "r", encoding="utf-8") as f:
+                results = json.load(f)
+
+        results.append(feedback_data)
+
+        with open(self.results_path, "w", encoding="utf-8") as f:
+            json.dump(results, f, ensure_ascii=False, indent=4)
diff --git a/llm_evaluation/evaluator.py b/llm_evaluation/evaluator.py
new file mode 100644
index 0000000..c15ceca
--- /dev/null
+++ b/llm_evaluation/evaluator.py
@@ -0,0 +1,87 @@
+import time
+import mlflow
+import os
+import importlib.metadata
+from langchain_core.messages import HumanMessage
+
+import numpy as np
+from llm_evaluation.dataset import QADataset
+from llm_evaluation.llm_evaluator import compare_sql_with_llm
+from llm_evaluation.mlflow_logger import log_to_mlflow
+
+
+class Evaluator:
+    def __init__(self, dataset_path):
+        self.dataset = QADataset(dataset_path)
+
+    def evaluate(self, generated_sql_fn):
+        """Evaluate Lang2SQL (the caller supplies the SQL generation function).
+
+        Each evaluation sample is recorded as a nested run.
+        """
+        results = []
+        metrics_by_type = {}  # scores collected per evaluation_type
+
+        # MLflow setup: tracking URI and experiment name
+        mlflow.set_tracking_uri(os.getenv("MLFLOW_TRACKING_URI"))
+        try:
+            lang2sql_version = importlib.metadata.version("lang2sql")
+        except importlib.metadata.PackageNotFoundError:
+            lang2sql_version = "unknown"
+        experiment_name = f"lang2sql-evaluation-v{lang2sql_version}"
+        mlflow.set_experiment(experiment_name)
+
+        # Wrap the whole evaluation in a single parent run
+        with mlflow.start_run(run_name="evaluation_run") as parent_run:
+            for (
+                question,
+                ground_truth_sql,
+                evaluation_type,
+            ) in self.dataset.get_samples():
+                start_time = time.time()
+                generated_sql = generated_sql_fn(question)
+                exec_time = time.time() - start_time
+
+                # LLM evaluation result (currently a single similarity score)
+                llm_score = compare_sql_with_llm(
+                    generated_sql, ground_truth_sql, question
+                )
+                # Collect scores per evaluation_type
+                if evaluation_type not in metrics_by_type:
+                    metrics_by_type[evaluation_type] = []
+                metrics_by_type[evaluation_type].append(llm_score)
+
+                feedback_data = {
+                    "question": question,
+                    "generated_sql": generated_sql,
+                    "ground_truth_sql": ground_truth_sql,
+                    "llm_evaluation_metric": llm_score,  # per-query metric
+                    "execution_time": exec_time,
+                    "evaluation_type": evaluation_type,
+                }
+
+                # Record each sample as a nested run (named after its evaluation_type)
+                with mlflow.start_run(nested=True, run_name=str(evaluation_type)):
+                    log_to_mlflow(
+                        question,
+                        generated_sql,
+                        ground_truth_sql,
+                        llm_score,
+                        evaluation_type,
+                    )
+                    mlflow.log_metric("execution_time", exec_time)
+
+                self.dataset.save_feedback(feedback_data)
+                results.append(feedback_data)
+
+            # Per-evaluation_type metrics (intended aggregates: mean, max, min, median)
+            aggregated_metrics = {}
+            for eval_type, scores in metrics_by_type.items():
+
+                for idx, score in enumerate(scores):
+                    mlflow.log_metric(f"{eval_type}", score, step=idx)
+
+            # Record aggregated_metrics as a tag as well (stringified)
+            mlflow.set_tag("aggregated_metrics", str(aggregated_metrics))
+
+        return results
diff --git a/llm_evaluation/example.json b/llm_evaluation/example.json
new file mode 100644
index 0000000..b6c0602
--- /dev/null
+++ b/llm_evaluation/example.json
@@ -0,0 +1,13 @@
+{
+    "evaluation_type": ["subscription", "hired_sdr", "subscription"],
+    "inputs": [
+        "Number of unique users on the subscription service",
+        "How many sales SDRs has the company hired?",
+        "Among free users (free_users) who started a subscription, I want the number of users per plan."
+    ],
+    "ground_truths": [
+        "SELECT toStartOfMonth(activity_ts) AS month, COUNT(DISTINCT entity_id) AS unique_users FROM client_stream_active_on_subscription WHERE activity_ts >= date_sub(current_date(), interval 3 month) GROUP BY month ORDER BY month",
+        "SELECT COUNT(DISTINCT entity_id) AS total_hired_sdrs FROM company_stream_hired_sdr WHERE activity_ts >= now() - INTERVAL 3 MONTH",
+        "SELECT feature_json, COUNT(entity_id) AS user_count FROM client_stream_started_subscription WHERE feature_json LIKE '%\"free_users\"%' GROUP BY feature_json"
+    ]
+}
diff --git a/llm_evaluation/llm_evaluator.py b/llm_evaluation/llm_evaluator.py
new file mode 100644
index 0000000..9ce1589
--- /dev/null
+++ b/llm_evaluation/llm_evaluator.py
@@ -0,0 +1,50 @@
+import os
+from langchain_core.prompts import ChatPromptTemplate
+from llm_utils.llm_factory import get_llm
+
+
+def compare_sql_with_llm(generated_sql, ground_truth_sql, user_query):
+    """Score generated SQL against the ground-truth SQL with an LLM (0 to 1)."""
+
+    # Initialize the LLM
+    llm = get_llm(
+        model_type="openai",
+        model_name="gpt-4o-mini",
+        openai_api_key=os.getenv("OPENAI_API_KEY"),
+    )
+
+    # Build the prompt template (the SQL strings are passed as template variables)
+    prompt = ChatPromptTemplate.from_messages(
+        [
+            (
+                "system",
+                """
+            You are a SQL expert. Compare the correctness and similarity of the two SQL queries below.
+            Rate their similarity on a scale from 0 (completely different) to 1 (identical).
+
+            Input description (what the SQL is supposed to do):
+            {user_query}
+
+            Ground-truth SQL:
+            {ground_truth_sql}
+
+            Generated SQL:
+            {generated_sql}
+
+            Consider both the similarity to the ground truth and whether the generated SQL correctly satisfies the input description.
+            Return only a similarity score between 0 and 1, rounded to two decimal places (e.g. 0.75, 0.42, 1.00).
+            Do not include any explanation or additional text.
+            """,
+            )
+        ]
+    )
+
+    # Run the LLM chain
+    chain = prompt | llm
+    response = chain.invoke({"user_query": user_query, "ground_truth_sql": ground_truth_sql, "generated_sql": generated_sql})
+
+    try:
+        score = float(response.content.strip())
+        return max(0, min(score, 1))  # clamp to the 0-1 range
+    except (ValueError, AttributeError):
+        return 0.0  # score 0 if the response cannot be parsed
diff --git a/llm_evaluation/mlflow_logger.py b/llm_evaluation/mlflow_logger.py
new file mode 100644
index 0000000..c9f32fa
--- /dev/null
+++ b/llm_evaluation/mlflow_logger.py
@@ -0,0 +1,21 @@
+import os
+import importlib.metadata
+import mlflow
+from dotenv import load_dotenv
+
+load_dotenv()
+
+
+def log_to_mlflow(
+    question, generated_sql, ground_truth_sql, llm_metric, evaluation_type=None
+):
+    """Log evaluation results inside the currently active run.
+
+    llm_metric is logged via mlflow.log_param, which stores the value as a string internally.
+    """
+    mlflow.log_param("question", question)
+    mlflow.log_param("generated_sql", generated_sql)
+    mlflow.log_param("ground_truth_sql", ground_truth_sql)
+    mlflow.log_param("llm_evaluation_metric", llm_metric)
+    if evaluation_type is not None:
+        mlflow.log_param("evaluation_type", evaluation_type)
diff --git a/requirements.txt b/requirements.txt
index 2c506a8..7b17b47 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -8,6 +8,7 @@ streamlit==1.41.1
 python-dotenv==1.0.1
 faiss-cpu==1.10.0
 pre_commit==4.1.0
+mlflow==2.21.0
 setuptools
 wheel
 twine
diff --git a/setup.py b/setup.py
index d5e4805..bbf59f4 100644
--- a/setup.py
+++ b/setup.py
@@ -24,6 +24,7 @@
         "streamlit==1.41.1",
         "python-dotenv==1.0.1",
         "faiss-cpu==1.10.0",
+        "mlflow==2.21.0",
     ],
     entry_points={
         "console_scripts": [
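Note: in `llm_evaluation/evaluator.py`, `aggregated_metrics` is initialized and logged as a tag but never populated, even though the comment and the otherwise unused `numpy` import point at per-type mean/max/min/median summaries. A minimal sketch of what that aggregation step could look like (illustrative only; `metrics_by_type` below is stubbed with sample scores rather than real evaluation output):

```python
import mlflow
import numpy as np

# Stand-in for the scores that evaluate() collects per evaluation_type.
metrics_by_type = {"subscription": [0.75, 1.00], "hired_sdr": [0.42]}

aggregated_metrics = {}
with mlflow.start_run(run_name="evaluation_run"):
    for eval_type, scores in metrics_by_type.items():
        # Summary statistics per evaluation_type (mean, max, min, median).
        aggregated_metrics[eval_type] = {
            "mean": float(np.mean(scores)),
            "max": float(np.max(scores)),
            "min": float(np.min(scores)),
            "median": float(np.median(scores)),
        }
        # Step-wise per-sample scores, as the existing loop already logs them.
        for idx, score in enumerate(scores):
            mlflow.log_metric(str(eval_type), score, step=idx)

    # Stringified summary recorded as a run tag, mirroring evaluator.py.
    mlflow.set_tag("aggregated_metrics", str(aggregated_metrics))
```

Logging the summaries as numeric metrics (for example a hypothetical `f"{eval_type}_mean"` key) instead of a string tag would also make them easier to compare across runs in the MLflow UI.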