Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Refactor main file/commands and expand dataclass usage #61

Merged
merged 35 commits into from
Jan 21, 2025
Merged
Show file tree
Hide file tree
Changes from 25 commits
Commits
Show all changes
35 commits
Select commit Hold shift + click to select a range
ce48e1a
Move CLI logic for data processing out of main file
skyfenton Dec 29, 2024
1b92193
Fixed comparison to use context object
skyfenton Jan 1, 2025
694bf4f
Removed formatting from help message to make message consistent
skyfenton Jan 1, 2025
5f68d9a
Create dataclasses based on required data attributes
skyfenton Jan 17, 2025
ee48002
add load command to main app cli
skyfenton Jan 17, 2025
8a91ce2
Refactor wiki_to_netflix functions using dataclasses
skyfenton Jan 17, 2025
cfec05c
Rename netflix_csv for readability (also we had duplicate variables)
skyfenton Jan 17, 2025
f6e991f
Change csv output name to distinguish between matches/missing
skyfenton Jan 17, 2025
04ff23d
Load rows from matches.csv to insert into Mongo
skyfenton Jan 17, 2025
066c259
Re-add tqdm and move print message to inside process function
skyfenton Jan 17, 2025
ea1fae2
Fix flatten_values instance check and join
skyfenton Jan 17, 2025
0c12743
Add explicit exc_info parameter for clarity
skyfenton Jan 17, 2025
8eac102
Rename AppContext attr to log_to_file for clarity
skyfenton Jan 17, 2025
4888306
Save log_to_file flag check from ctx.obj
skyfenton Jan 17, 2025
9c08135
Update all docstrings and type hints
skyfenton Jan 18, 2025
6291d9f
Change .joinpath to / for readability
skyfenton Jan 18, 2025
a0d9aa0
Refactor create_netflix_csv to use csv.DictWriter
skyfenton Jan 18, 2025
976f3a8
Set default for user_agent variable
skyfenton Jan 18, 2025
d86c4a2
Make wiki_query more readable without else statement
skyfenton Jan 18, 2025
a9e812c
Change underscores to dashes in options to match typical cli formatting
skyfenton Jan 18, 2025
a99ea1c
Rename dataclasses.py to schemas.py and rename models directory to re…
skyfenton Jan 18, 2025
7c78cc4
Use vars instead of __dict__
skyfenton Jan 18, 2025
9d4b2b6
Add variable to count total (in case num_rows is None)
skyfenton Jan 19, 2025
bca5cab
Fix num_rows type hint
skyfenton Jan 19, 2025
e0a957d
Fix load.py module ordering
skyfenton Jan 19, 2025
c6c09d6
Remove spaces in title outputs and use double quotes for readability
skyfenton Jan 20, 2025
766bf18
Remove unused comment for SPARQL debug
skyfenton Jan 20, 2025
c13aca1
Move schemas.py into movies.py under schemas dir
skyfenton Jan 20, 2025
870165d
Add test for wiki_query
skyfenton Jan 20, 2025
d2f762d
Change to simpler equality check via sorting
skyfenton Jan 20, 2025
ab6f787
Fix grammar
skyfenton Jan 20, 2025
9cf3c63
Clarify how to use the CLI in the README
skyfenton Jan 20, 2025
8dc6c0d
Merge branch 'main' into 58-connect-wiki_to_netflixmongo-insertionmai…
skyfenton Jan 20, 2025
351c256
Simplified testing instructions
skyfenton Jan 20, 2025
e8ff906
Merge branch '58-connect-wiki_to_netflixmongo-insertionmain-file' of …
skyfenton Jan 20, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
332 changes: 183 additions & 149 deletions mediabridge/data_processing/wiki_to_netflix.py

Large diffs are not rendered by default.

36 changes: 36 additions & 0 deletions mediabridge/db/load.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,36 @@
import csv
import dataclasses
import logging

from typer import Typer

from mediabridge.definitions import OUTPUT_DIR
from mediabridge.schemas import EnrichedMovieData

log = logging.getLogger(__name__)
app = Typer()


@app.command()
def load():
    """
    Load a csv of movie data into the mongo database.

    Reads ``matches.csv`` from ``OUTPUT_DIR``, checks that its header row
    lists exactly the field names of ``EnrichedMovieData`` (same names, same
    order), then builds one ``EnrichedMovieData`` per remaining row and logs
    it. Insertion into MongoDB is not implemented yet.

    Raises:
        ValueError: If the file is empty or its header row does not match the
            dataclass field names.
    """
    expected_header = [
        field.name for field in dataclasses.fields(EnrichedMovieData)
    ]

    # newline="" is what the csv module expects so that it can handle line
    # endings (including newlines embedded in quoted fields) itself.
    with open(OUTPUT_DIR / "matches.csv", "r", newline="") as f:
        reader = csv.reader(f)

        # Default of None turns an empty file into a header mismatch below
        # instead of leaking StopIteration out of the command.
        header = next(reader, None)
        if header != expected_header:
            raise ValueError(
                "Header does not match expected dataclass fields (EnrichedMovieData), "
                f"expected {expected_header}, got {header}"
            )

        for row in reader:
            # NOTE(review): csv.reader yields every cell as a str, so all
            # fields of `movie` are strings here; convert them to the
            # dataclass's declared types before the real insert is added.
            movie = EnrichedMovieData(*row)
            log.info("Inserting %s into MongoDB", movie)
            # TODO: Needs implementation, bulk inserts for performance


# Run the Typer app defined above when this module is executed directly.
if __name__ == "__main__":
    app()
10 changes: 8 additions & 2 deletions mediabridge/db/queries.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,9 +5,9 @@ def insert_into_mongo(movie):
db = connect_to_mongo()
collection = db["movies"]
collection.update_one(
{"wikidata_id": movie[1]},
{"_id": movie[1]},
audiodude marked this conversation as resolved.
Show resolved Hide resolved
{
"set": {
"$set": {
skyfenton marked this conversation as resolved.
Show resolved Hide resolved
"netflix_id": movie[0],
"wikidata_id": movie[1],
"title": movie[2],
Expand All @@ -18,3 +18,9 @@ def insert_into_mongo(movie):
},
upsert=True,
)


def bulk_insert(operations):
    """Apply a batch of write operations to the ``movies`` collection.

    Args:
        operations: Write operations handed unchanged to
            ``collection.bulk_write``. An empty (or ``None``) batch is a
            no-op.

    Returns:
        None.
    """
    # PyMongo refuses an empty request list, so an empty batch means there is
    # nothing to do; return before opening a connection.
    if not operations:
        return

    db = connect_to_mongo()
    collection = db["movies"]
    collection.bulk_write(operations)
4 changes: 2 additions & 2 deletions mediabridge/definitions.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,8 +7,8 @@

MODULE_DIR = Path(__file__).parent
PROJECT_DIR = MODULE_DIR.parent
DATA_DIR = PROJECT_DIR.joinpath("data")
OUTPUT_DIR = PROJECT_DIR.joinpath("out")
DATA_DIR = PROJECT_DIR / "data"
OUTPUT_DIR = PROJECT_DIR / "out"

if __name__ == "__main__":
print(MODULE_DIR)
Expand Down
58 changes: 16 additions & 42 deletions mediabridge/main.py
Original file line number Diff line number Diff line change
@@ -1,42 +1,32 @@
import logging
from contextlib import nullcontext
from dataclasses import dataclass
from datetime import datetime

import typer as typer
from tqdm.contrib.logging import logging_redirect_tqdm

from mediabridge.data_processing import wiki_to_netflix
from mediabridge.db import load
from mediabridge.definitions import OUTPUT_DIR

app = typer.Typer(no_args_is_help=True, add_completion=False)
app.add_typer(wiki_to_netflix.app)
skyfenton marked this conversation as resolved.
Show resolved Hide resolved
app.add_typer(load.app)


@dataclass
class AppContext:
    """Shared CLI state that the root callback stores on ``ctx.obj``."""

    # True when the --log option was passed, i.e. logging goes to a file.
    log_to_file: bool = False


@app.callback()
def main(
ctx: typer.Context,
verbose: bool = typer.Option(
False, "--verbose", "-v", help="Enable verbose logging."
),
log: bool = typer.Option(
False, "--log", "-l", help="Enable all logging message levels and log to file."
),
full: bool = typer.Option(
False,
"--full",
"-f",
help="Run processing on full dataset. Overrides --num_rows.",
),
num_rows: int = typer.Option(
100,
"--num_rows",
"-n",
help="Number of rows to process. If --full is True, all rows are processed",
),
missing_out_path: str = typer.Option(
None,
"--missing_out_path",
"-m",
help=(
f"If provided, movies that could not be matched will be written to a "
f"CSV at this path, relative to the {OUTPUT_DIR} directory."
),
),
):
if not OUTPUT_DIR.exists():
print(
Expand All @@ -48,9 +38,7 @@ def main(
# log all messages to new file
logging.basicConfig(
level=logging.DEBUG,
filename=OUTPUT_DIR.joinpath(
f"mb_{datetime.now().strftime('%Y%m%d_%H%M%S')}.log"
),
filename=OUTPUT_DIR / f"mb_{datetime.now().strftime('%Y%m%d_%H%M%S')}.log",
filemode="x",
format="%(asctime)s,%(msecs)d %(name)s %(levelname)s %(message)s",
datefmt="%H:%M:%S",
Expand All @@ -61,22 +49,8 @@ def main(
else:
level = logging.WARNING
logging.basicConfig(level=level, format="[%(levelname)s] %(message)s")

# We redirect logs to stdout through tqdm to avoid breaking progress bar.
# But when logging to file, we use nullcontext or tqdm will redirect logs
# back to stdout.
with logging_redirect_tqdm() if not log else nullcontext():
num_rows = None if full else num_rows
try:
wiki_to_netflix.process_data(
num_rows, output_missing_csv_path=missing_out_path
)
except Exception as e:
# include fatal exceptions with traceback in logs
if log:
logging.exception("Uncaught exception")
raise e
ctx.obj = AppContext(log_to_file=log)


if __name__ == "__main__":
typer.run(main)
app()
File renamed without changes.
File renamed without changes.
36 changes: 36 additions & 0 deletions mediabridge/schemas.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,36 @@
from dataclasses import dataclass
from typing import Optional


@dataclass(order=True)
class MovieData:
    """Dataclass for known data from the Netflix dataset"""

    netflix_id: int
    title: str
    year: int

    def flatten_values(self) -> dict[str, str]:
        """Format all dataclass fields into a mapping of strings.

        List values are joined with semicolons; each item is passed through
        ``str`` first so a list holding non-string items does not raise
        ``TypeError``. Every other value is converted with ``str`` (so
        ``None`` becomes ``"None"``).

        Returns:
            A dict mapping each instance attribute name to its string form.
        """
        return {
            name: (
                ";".join(map(str, value))
                if isinstance(value, list)
                else str(value)
            )
            for name, value in vars(self).items()
        }


@dataclass(order=True)
class EnrichedMovieData(MovieData):
    """Dataclass for enriched data from a Wikidata match"""

    # Identifier of the matched Wikidata item (e.g. "Q11424").
    wikidata_id: str
    # Genre names, or None; lists are joined with ";" by flatten_values().
    genres: Optional[list[str]]
    # Director name, or None.
    director: Optional[str]


if __name__ == "__main__":
    # Manual smoke check: build one sample enriched record and print its
    # flattened (all-strings) form.
    sample = EnrichedMovieData(
        netflix_id=1,
        title="The Matrix",
        year=1999,
        wikidata_id="Q11424",
        genres=["Action", "Drama"],
        director="Lana Wachowski",
    )
    flattened = sample.flatten_values()
    print(flattened)
Loading