Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
11 changes: 10 additions & 1 deletion backend/.gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,15 @@ __pycache__/

# logging
*.log
logs

# security
*.pem

# loki
chunks
compactor
rules
tsdb-shipper-active
tsdb-shipper-cache
wal
2 changes: 1 addition & 1 deletion backend/alembic/env.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@
DATABASE_URL = get_env("DATABASE_URL")

if not DATABASE_URL:
raise ValueError(f"No database URL configured for the {get_env("ENVIRONMENT")} environment.")
raise ValueError(f"No database URL configured for the {get_env('ENVIRONMENT')} environment.")

config = context.config
config.set_main_option("sqlalchemy.url", DATABASE_URL)
Expand Down
12 changes: 12 additions & 0 deletions backend/app/db/database.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
import asyncpg
from sqlalchemy.ext.asyncio import create_async_engine, AsyncSession
from sqlalchemy.orm import sessionmaker
from sqlalchemy.ext.declarative import declarative_base
Expand Down Expand Up @@ -33,3 +34,14 @@ async def create_db_session():

async def close_db_session(session):
await session.close()


async def get_connection():
    """Open a direct asyncpg connection to the PostgreSQL database and return it.

    The SQLAlchemy-style DSN (``postgresql+asyncpg://...``) is rewritten to a
    plain ``postgresql://...`` URL because asyncpg does not understand the
    dialect suffix. The caller is responsible for closing the connection
    (see ``close_connection``).
    """
    conn = await asyncpg.connect(DATABASE_URL.replace('postgresql+asyncpg', 'postgresql'))
    return conn


async def close_connection(conn):
    """Close a PostgreSQL connection previously returned by ``get_connection``."""
    await conn.close()
2 changes: 2 additions & 0 deletions backend/app/llm/langchain.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@
from langchain_text_splitters import RecursiveCharacterTextSplitter

from app.llm.message_task import store
from app.utils.logging import logger


def get_rag_chain(document_text: str):
Expand Down Expand Up @@ -115,6 +116,7 @@ def get_langchain_response(user_message: str, chat_room_id: int):
)

except Exception as e:
logger.error("Error generating answer: ", exc_info={e})
raise RuntimeError(f"Error generating answer: {e}")


Expand Down
37 changes: 35 additions & 2 deletions backend/app/main.py
Original file line number Diff line number Diff line change
@@ -1,10 +1,12 @@
import time
from fastapi import FastAPI, HTTPException, Request
from fastapi.responses import JSONResponse
from fastapi.middleware.cors import CORSMiddleware
from contextlib import asynccontextmanager
from app.utils.logging import log_exception
from app.routers import auth, me, folders, chatrooms, messages, links, rating

from app.routers import auth, me, folders, chatrooms, messages, links, rating, metrics
from app.metrics.prometheus_metrics import REQUEST_COUNT, REQUEST_DURATION, RESPONSE_STATUS
from prometheus_fastapi_instrumentator import Instrumentator

@asynccontextmanager
async def lifespan(app: FastAPI):
Expand All @@ -17,6 +19,7 @@ async def lifespan(app: FastAPI):

origins = [
"http://localhost:3000",
"http://localhost:3001",
]

app.add_middleware(
Expand All @@ -27,6 +30,35 @@ async def lifespan(app: FastAPI):
allow_headers=["*"],
)

instrumentator = Instrumentator().instrument(app)
instrumentator.expose(app)

@app.middleware("http")
async def metrics_middleware(request: Request, call_next):
    """Record Prometheus metrics for every HTTP request.

    Measures wall-clock processing time, increments the request counter,
    observes the duration histogram, counts the response status code, and
    exposes the elapsed time to clients via the ``X-Process-Time`` header.
    """
    start_time = time.time()
    response = await call_next(request)
    process_time = time.time() - start_time

    # Collect the label values once (the original assigned method/path twice).
    method = request.method
    path = request.url.path
    status_code = response.status_code

    REQUEST_COUNT.labels(method=method, path=path).inc()
    REQUEST_DURATION.labels(method=method, path=path).observe(process_time)
    RESPONSE_STATUS.labels(method=method, path=path, status_code=status_code).inc()

    # Surface the processing time for client-side debugging (set once, not twice).
    response.headers["X-Process-Time"] = str(process_time)

    return response


@app.exception_handler(HTTPException)
async def custom_http_exception_handler(req: Request, exc: HTTPException):
Expand All @@ -46,3 +78,4 @@ async def custom_http_exception_handler(req: Request, exc: HTTPException):
app.include_router(messages.router)
app.include_router(links.router)
app.include_router(rating.router)
app.include_router(metrics.router)
108 changes: 108 additions & 0 deletions backend/app/metrics/pg_metrics.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,108 @@
from prometheus_client import Gauge
from app.db.database import get_connection, close_connection
from app.metrics.prometheus_metrics import *

async def record_pg_metrics():
    """Collect PostgreSQL server statistics and publish them as Prometheus gauges.

    Opens a dedicated asyncpg connection, queries the ``pg_stat_*`` /
    ``pg_statio_*`` statistics views, and updates the module-level gauges
    from ``app.metrics.prometheus_metrics``. The connection is always
    closed, even when a query raises.
    """
    conn = await get_connection()
    try:
        # Total committed / rolled-back transactions for the current database.
        commit_result = await conn.fetchval("SELECT xact_commit FROM pg_stat_database WHERE datname = current_database();")
        COMMIT_COUNT.set(commit_result)

        rollback_result = await conn.fetchval("SELECT xact_rollback FROM pg_stat_database WHERE datname = current_database();")
        ROLLBACK_COUNT.set(rollback_result)

        # Number of backends currently executing a query.
        active_conn_result = await conn.fetchval("SELECT count(*) FROM pg_stat_activity WHERE state = 'active';")
        ACTIVE_CONNECTIONS.set(active_conn_result)

        # Overall buffer-cache hit ratio, in percent. 100.0 forces float
        # division — the original's integer division truncated the ratio.
        cache_hit_result = await conn.fetchval("""
            SELECT
                CASE
                    WHEN SUM(heap_blks_hit + heap_blks_read) = 0 THEN 0
                    ELSE 100.0 * SUM(heap_blks_hit) / SUM(heap_blks_hit + heap_blks_read)
                END AS overall_cache_hit_ratio
            FROM pg_statio_user_tables
            WHERE (heap_blks_hit + heap_blks_read) > 0;
        """)
        # fetchval returns None when no user table has any block activity.
        TOTAL_CACHE_HIT_RATIO.set(cache_hit_result if cache_hit_result is not None else 0)

        # Per-table cache hit ratio (percent, float division as above).
        cache_hit_results = await conn.fetch("""
            SELECT relname,
                heap_blks_hit,
                heap_blks_read,
                CASE
                    WHEN (heap_blks_hit + heap_blks_read) = 0 THEN 0
                    ELSE 100.0 * heap_blks_hit / (heap_blks_hit + heap_blks_read)
                END AS cache_hit_ratio
            FROM pg_statio_user_tables;
        """)

        for result in cache_hit_results:
            table_name = result['relname']
            cache_hit_ratio = result['cache_hit_ratio']

            # Lazily create one gauge per table and cache it in the registry.
            # NOTE(review): assumes relname contains only characters valid in
            # a Prometheus metric name — confirm for unusually named tables.
            if table_name not in CACHE_HIT_RATIO_METRICS:
                CACHE_HIT_RATIO_METRICS[table_name] = Gauge(f'cache_hit_ratio_{table_name}', f'Cache hit ratio for {table_name} table')

            CACHE_HIT_RATIO_METRICS[table_name].set(cache_hit_ratio)

        # Fraction of scans served by indexes. The ::float cast is required:
        # bigint / bigint is integer division in PostgreSQL, so the original
        # query could only ever yield 0 or 1.
        index_scan_result = await conn.fetchval("""
            SELECT CASE
                WHEN (sum(seq_scan) + sum(idx_scan)) = 0 THEN 0
                ELSE sum(idx_scan)::float / (sum(seq_scan) + sum(idx_scan))
            END AS index_scan_ratio
            FROM pg_stat_user_tables;
        """)

        # sum() over zero rows yields NULL -> None; guard before setting
        # (the original comment claimed this but did not actually do it).
        INDEX_SCAN_RATIO.set(index_scan_result if index_scan_result is not None else 0)

        # Row-level throughput counters for the current database.
        rates_result = await conn.fetchrow("""
            SELECT tup_fetched, tup_inserted, tup_updated, tup_deleted
            FROM pg_stat_database WHERE datname = current_database();
        """)
        FETCH_RATE.set(rates_result['tup_fetched'])
        INSERT_RATE.set(rates_result['tup_inserted'])
        UPDATE_RATE.set(rates_result['tup_updated'])
        DELETE_RATE.set(rates_result['tup_deleted'])

        # Deadlock count.
        deadlock_result = await conn.fetchval("""
            SELECT deadlocks FROM pg_stat_database WHERE datname = current_database();
        """)
        DEADLOCK_COUNT.set(deadlock_result)

        # Replication lag in bytes (first standby only, if any).
        replication_lag_result = await conn.fetchval("""
            SELECT pg_current_wal_lsn() - replay_lsn AS replication_lag_bytes
            FROM pg_stat_replication;
        """)

        # No rows (no standby connected) -> None; report 0 in that case.
        REPLICATION_LAG_BYTES.set(replication_lag_result if replication_lag_result is not None else 0)

        # Block reads served from disk vs. from the shared buffer cache.
        disk_result = await conn.fetchval("SELECT blks_read FROM pg_stat_database WHERE datname = current_database();")
        READ_DISK.set(disk_result)

        cache_result = await conn.fetchval("SELECT blks_hit FROM pg_stat_database WHERE datname = current_database();")
        READ_CACHE.set(cache_result)

    finally:
        await close_connection(conn)
23 changes: 23 additions & 0 deletions backend/app/metrics/prometheus_metrics.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
from prometheus_client import Counter, Histogram, Gauge

# Prometheus metric definitions (backend HTTP layer) — labelled by request
# method and URL path; status counter additionally carries the status code.
REQUEST_COUNT = Counter("request_count", "Total request count", ["method", "path"])
REQUEST_DURATION = Histogram("request_duration_seconds", "Request duration", ["method", "path"])
RESPONSE_STATUS = Counter('http_response_status', 'HTTP Response Status Codes', ['method', 'path', 'status_code'])

# Prometheus metric definitions (PostgreSQL) — populated by
# app.metrics.pg_metrics.record_pg_metrics from the pg_stat_* views.
ACTIVE_CONNECTIONS = Gauge('pg_active_connections', 'Number of active connections')
TOTAL_CACHE_HIT_RATIO = Gauge('pg_total_cache_hit_ratio', 'Total cache hit ratio')
INDEX_SCAN_RATIO = Gauge('pg_index_scan_ratio', 'Index scan ratio')
FETCH_RATE = Gauge('pg_fetch_rate', 'Fetch throughput')
INSERT_RATE = Gauge('pg_insert_rate', 'Insert throughput')
UPDATE_RATE = Gauge('pg_update_rate', 'Update throughput')
DELETE_RATE = Gauge('pg_delete_rate', 'Delete throughput')
DEADLOCK_COUNT = Gauge('pg_deadlock_count', 'Number of deadlocks')
REPLICATION_LAG_BYTES = Gauge('pg_replication_lag_bytes', 'Replication lag in bytes')
COMMIT_COUNT = Gauge('pg_stat_database_xact_commit', 'Number of commit transactions')
ROLLBACK_COUNT = Gauge('pg_stat_database_xact_rollback', 'Number of rollback transactions')
READ_DISK = Gauge('pg_stat_database_blks_read', 'Number of blocks read from disk')
READ_CACHE = Gauge('pg_stat_database_blks_hit', 'Number of blocks read from cache')

# Registry of per-table cache-hit-ratio gauges, keyed by table name.
# Gauges are created lazily (Prometheus forbids registering the same
# metric name twice, so they must be cached across collection runs).
CACHE_HIT_RATIO_METRICS = {}
Loading