Skip to content

Commit

Permalink
Merge pull request #2748 from Arize-ai/migrate
Browse files Browse the repository at this point in the history
feat(persistence): migrations support
  • Loading branch information
mikeldking authored Apr 4, 2024
2 parents d05454b + a4d4c88 commit b56820d
Show file tree
Hide file tree
Showing 12 changed files with 387 additions and 62 deletions.
1 change: 1 addition & 0 deletions cspell.json
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,7 @@
"respx",
"rgba",
"seafoam",
"sqlalchemy",
"templating",
"tensorboard",
"testset",
Expand Down
1 change: 1 addition & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -51,6 +51,7 @@ dependencies = [
"openinference-instrumentation-llama-index>=1.2.0",
"openinference-instrumentation-openai>=0.1.4",
"sqlalchemy>=2, <3",
"alembic>=1.3.0, <2",
]
dynamic = ["version"]

Expand Down
3 changes: 3 additions & 0 deletions src/phoenix/db/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
from .migrate import migrate

__all__ = ["migrate"]
119 changes: 119 additions & 0 deletions src/phoenix/db/alembic.ini
Original file line number Diff line number Diff line change
@@ -0,0 +1,119 @@
# A generic, single database configuration.

[alembic]
# path to migration scripts
# Note this is overridden in .migrate during programatic migrations
script_location = migrations

# template used to generate migration file names; The default value is %%(rev)s_%%(slug)s
# Uncomment the line below if you want the files to be prepended with date and time
# see https://alembic.sqlalchemy.org/en/latest/tutorial.html#editing-the-ini-file
# for all available tokens
# file_template = %%(year)d_%%(month).2d_%%(day).2d_%%(hour).2d%%(minute).2d-%%(rev)s_%%(slug)s

# sys.path path, will be prepended to sys.path if present.
# defaults to the current working directory.
prepend_sys_path = .

# timezone to use when rendering the date within the migration file
# as well as the filename.
# If specified, requires the python>=3.9 or backports.zoneinfo library.
# Any required deps can installed by adding `alembic[tz]` to the pip requirements
# string value is passed to ZoneInfo()
# leave blank for localtime
# timezone =

# max length of characters to apply to the
# "slug" field
# truncate_slug_length = 40

# set to 'true' to run the environment during
# the 'revision' command, regardless of autogenerate
# revision_environment = false

# set to 'true' to allow .pyc and .pyo files without
# a source .py file to be detected as revisions in the
# versions/ directory
# sourceless = false

# version location specification; This defaults
# to migrations/versions. When using multiple version
# directories, initial revisions must be specified with --version-path.
# The path separator used here should be the separator specified by "version_path_separator" below.
# version_locations = %(here)s/bar:%(here)s/bat:migrations/versions

# version path separator; As mentioned above, this is the character used to split
# version_locations. The default within new alembic.ini files is "os", which uses os.pathsep.
# If this key is omitted entirely, it falls back to the legacy behavior of splitting on spaces and/or commas.
# Valid values for version_path_separator are:
#
# version_path_separator = :
# version_path_separator = ;
# version_path_separator = space
version_path_separator = os # Use os.pathsep. Default configuration used for new projects.

# set to 'true' to search source files recursively
# in each "version_locations" directory
# new in Alembic version 1.10
# recursive_version_locations = false

# the output encoding used when revision files
# are written from script.py.mako
# output_encoding = utf-8

# NB: This is commented out intentionally as it is dynamic
# See migrations/env.py
# sqlalchemy.url = driver://user:pass@localhost/dbname


[post_write_hooks]
# post_write_hooks defines scripts or Python functions that are run
# on newly generated revision scripts. See the documentation for further
# detail and examples

# format using "black" - use the console_scripts runner, against the "black" entrypoint
# hooks = black
# black.type = console_scripts
# black.entrypoint = black
# black.options = -l 79 REVISION_SCRIPT_FILENAME

# lint with attempts to fix using "ruff" - use the exec runner, execute a binary
# hooks = ruff
# ruff.type = exec
# ruff.executable = %(here)s/.venv/bin/ruff
# ruff.options = --fix REVISION_SCRIPT_FILENAME

# Logging configuration
[loggers]
keys = root,sqlalchemy,alembic

[handlers]
keys = console

[formatters]
keys = generic

[logger_root]
level = WARN
handlers = console
qualname =

[logger_sqlalchemy]
level = WARN
handlers =
qualname = sqlalchemy.engine

[logger_alembic]
level = WARN
handlers =
qualname = alembic

[handler_console]
class = StreamHandler
args = (sys.stderr,)
level = NOTSET
formatter = generic

[formatter_generic]
format = %(levelname)-5.5s [%(name)s] %(message)s
datefmt = %H:%M:%S
69 changes: 18 additions & 51 deletions src/phoenix/db/database.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,10 +8,10 @@

import numpy as np
from openinference.semconv.trace import SpanAttributes
from sqlalchemy import Engine, create_engine, event
from sqlalchemy import Engine, create_engine, event, insert
from sqlalchemy.orm import sessionmaker

from phoenix.db.models import Base, Trace
from phoenix.db.models import Base, Project, Trace
from phoenix.trace.schemas import (
ComputedValues,
Span,
Expand All @@ -21,6 +21,8 @@
SpanStatusCode,
)

from .migrate import migrate

_CONFIG = """
PRAGMA foreign_keys = ON;
PRAGMA journal_mode = WAL;
Expand All @@ -29,50 +31,6 @@
PRAGMA busy_timeout = 10000;
"""

_INIT_DB = """
BEGIN;
CREATE TABLE projects (
id INTEGER PRIMARY KEY,
name TEXT NOT NULL UNIQUE,
description TEXT,
created_at TIMESTAMP NOT NULL DEFAULT CURRENT_TIMESTAMP,
updated_at TIMESTAMP NOT NULL DEFAULT CURRENT_TIMESTAMP
);
INSERT INTO projects(name) VALUES('default');
CREATE TABLE traces (
id INTEGER PRIMARY KEY,
trace_id TEXT UNIQUE NOT NULL,
project_rowid INTEGER NOT NULL,
session_id TEXT,
start_time TIMESTAMP NOT NULL,
end_time TIMESTAMP NOT NULL,
FOREIGN KEY(project_rowid) REFERENCES projects(id)
);
CREATE INDEX idx_trace_start_time ON traces(start_time);
CREATE TABLE spans (
id INTEGER PRIMARY KEY,
span_id TEXT UNIQUE NOT NULL,
trace_rowid INTEGER NOT NULL,
parent_span_id TEXT,
kind TEXT NOT NULL,
name TEXT NOT NULL,
start_time TIMESTAMP NOT NULL,
end_time TIMESTAMP NOT NULL,
attributes JSON,
events JSON,
status TEXT CHECK(status IN ('UNSET','OK','ERROR')) NOT NULL DEFAULT('UNSET'),
status_message TEXT,
latency_ms REAL,
cumulative_error_count INTEGER NOT NULL DEFAULT 0,
cumulative_llm_token_count_prompt INTEGER NOT NULL DEFAULT 0,
cumulative_llm_token_count_completion INTEGER NOT NULL DEFAULT 0,
FOREIGN KEY(trace_rowid) REFERENCES traces(id)
);
CREATE INDEX idx_parent_span_id ON spans(parent_span_id);
PRAGMA user_version = 1;
COMMIT;
"""


_MEM_DB_STR = "file::memory:?cache=shared"

Expand Down Expand Up @@ -105,19 +63,28 @@ def __init__(self, db_path: Optional[Path] = None) -> None:
# self.con.set_trace_callback(print)
cur = self.con.cursor()
cur.executescript(_CONFIG)
if int(cur.execute("PRAGMA user_version;").fetchone()[0]) < 1:
cur.executescript(_INIT_DB)

db_url = f"sqlite:///{db_path}" if db_path else "sqlite:///:memory:"
engine = (
create_engine(f"sqlite:///{db_path}", echo=True)
create_engine(db_url, echo=True)
if db_path
else create_engine(
"sqlite:///:memory:",
db_url,
echo=True,
creator=_mem_db_creator,
)
)
Base.metadata.create_all(engine)

# TODO this should be moved out
if db_path:
migrate(db_url)
else:
Base.metadata.create_all(engine)
# Create the default project and setup indexes
with engine.connect() as conn:
conn.execute(insert(Project).values(name="default", description="default project"))
conn.commit()

self.Session = sessionmaker(bind=engine)

def insert_span(self, span: Span, project_name: str) -> None:
Expand Down
27 changes: 27 additions & 0 deletions src/phoenix/db/migrate.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
import logging
from pathlib import Path

from alembic import command
from alembic.config import Config

logger = logging.getLogger(__name__)


def migrate(url: str) -> None:
"""
Runs migrations on the database.
NB: Migrate only works on non-memory databases.
Args:
url: The database URL.
"""
logger.warning("Running migrations on the database")
config_path = str(Path(__file__).parent.resolve() / "alembic.ini")
alembic_cfg = Config(config_path)

# Explicitly set the migration directory
scripts_location = str(Path(__file__).parent.resolve() / "migrations")
alembic_cfg.set_main_option("script_location", scripts_location)
alembic_cfg.set_main_option("sqlalchemy.url", url)

command.upgrade(alembic_cfg, "head")
1 change: 1 addition & 0 deletions src/phoenix/db/migrations/README
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
Generic single-database configuration.
74 changes: 74 additions & 0 deletions src/phoenix/db/migrations/env.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,74 @@
from logging.config import fileConfig

from alembic import context
from phoenix.db.models import Base
from sqlalchemy import engine_from_config, pool

# this is the Alembic Config object, which provides
# access to the values within the .ini file in use.
config = context.config

# Interpret the config file for Python logging.
# This line sets up loggers basically.
if config.config_file_name is not None:
fileConfig(config.config_file_name)

# add your model's MetaData object here
# for 'autogenerate' support

target_metadata = Base.metadata

# other values from the config, defined by the needs of env.py,
# can be acquired:
# my_important_option = config.get_main_option("my_important_option")
# ... etc.


def run_migrations_offline() -> None:
"""Run migrations in 'offline' mode.
This configures the context with just a URL
and not an Engine, though an Engine is acceptable
here as well. By skipping the Engine creation
we don't even need a DBAPI to be available.
Calls to context.execute() here emit the given string to the
script output.
"""
url = config.get_main_option("sqlalchemy.url")
context.configure(
url=url,
target_metadata=target_metadata,
literal_binds=True,
dialect_opts={"paramstyle": "named"},
)

with context.begin_transaction():
context.run_migrations()


def run_migrations_online() -> None:
"""Run migrations in 'online' mode.
In this scenario we need to create an Engine
and associate a connection with the context.
"""
connectable = engine_from_config(
config.get_section(config.config_ini_section, {}),
prefix="sqlalchemy.",
poolclass=pool.NullPool,
)

with connectable.connect() as connection:
context.configure(connection=connection, target_metadata=target_metadata)

with context.begin_transaction():
context.run_migrations()


if context.is_offline_mode():
run_migrations_offline()
else:
run_migrations_online()
26 changes: 26 additions & 0 deletions src/phoenix/db/migrations/script.py.mako
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
"""${message}

Revision ID: ${up_revision}
Revises: ${down_revision | comma,n}
Create Date: ${create_date}

"""
from typing import Sequence, Union

from alembic import op
import sqlalchemy as sa
${imports if imports else ""}

# revision identifiers, used by Alembic.
revision: str = ${repr(up_revision)}
down_revision: Union[str, None] = ${repr(down_revision)}
branch_labels: Union[str, Sequence[str], None] = ${repr(branch_labels)}
depends_on: Union[str, Sequence[str], None] = ${repr(depends_on)}


def upgrade() -> None:
${upgrades if upgrades else "pass"}


def downgrade() -> None:
${downgrades if downgrades else "pass"}
Loading

0 comments on commit b56820d

Please sign in to comment.