noisebridge · skyfenton · Jan 21, 2025 · Dec 29, 2024 · Jan 1, 2025 · Jan 1, 2025
diff --git a/mediabridge/data_processing/wiki_to_netflix.py b/mediabridge/data_processing/wiki_to_netflix.py
diff --git a/mediabridge/db/load.py b/mediabridge/db/load.py
@@ -0,0 +1,36 @@
+import csv
+import dataclasses
+import logging
+
+from typer import Typer
+
+from mediabridge.definitions import OUTPUT_DIR
+from mediabridge.schemas import EnrichedMovieData
+
+# Module-level logger named after this module.
+log = logging.getLogger(__name__)
+# Typer application that the commands below are registered on.
+app = Typer()
+
+
+@app.command()
+def load():
+    """
+    Load a csv of movie data into the mongo database.
+
+    Reads ``matches.csv`` from OUTPUT_DIR, checks that its header row equals
+    the field names of EnrichedMovieData, then builds one EnrichedMovieData
+    per remaining row and logs it. The MongoDB insert itself is not
+    implemented yet (see TODO below).
+
+    Raises:
+        ValueError: If the file is empty or its header differs from the
+            EnrichedMovieData field names.
+    """
+    expected_header = [field.name for field in dataclasses.fields(EnrichedMovieData)]
+
+    # newline="" is what the csv module recommends so that newlines embedded
+    # in quoted fields are interpreted correctly.
+    with open(OUTPUT_DIR / "matches.csv", "r", newline="") as f:
+        reader = csv.reader(f)
+
+        # An empty file has no header row; report that as a header mismatch
+        # instead of leaking a bare StopIteration.
+        header = next(reader, None)
+        if header != expected_header:
+            raise ValueError(
+                "Header does not match expected dataclass fields (EnrichedMovieData), "
+                f"expected {expected_header}, got {header}"
+            )
+
+        for row in reader:
+            # NOTE: csv.reader yields strings only; no type conversion is
+            # applied before constructing the dataclass.
+            movie = EnrichedMovieData(*row)
+            log.info(f"Inserting {movie} into MongoDB")
+            # TODO: Needs implementation, bulk inserts for performance
+
+
+# Run the Typer app when this module is executed as a script.
+if __name__ == "__main__":
+    app()
diff --git a/mediabridge/db/queries.py b/mediabridge/db/queries.py
@@ -5,9 +5,9 @@ def insert_into_mongo(movie):
     db = connect_to_mongo()
     collection = db["movies"]
     collection.update_one(
-        {"wikidata_id": movie[1]},
+        {"_id": movie[1]},
         {
-            "set": {
+            "$set": {
                 "netflix_id": movie[0],
                 "wikidata_id": movie[1],
                 "title": movie[2],
@@ -18,3 +18,9 @@ def insert_into_mongo(movie):
         },
         upsert=True,
     )
+
+
+def bulk_insert(operations):
+    """Write a batch of operations to the "movies" collection in one call.
+
+    Args:
+        operations: Write operations handed to ``collection.bulk_write``
+            (presumably pymongo operation objects such as UpdateOne;
+            confirm against the caller).
+
+    Returns:
+        Whatever ``collection.bulk_write`` returns, or None when there are
+        no operations to write.
+    """
+    # bulk_write raises on an empty request list, so skip connecting at all
+    # when there is nothing to do.
+    if not operations:
+        return None
+    db = connect_to_mongo()
+    collection = db["movies"]
+    return collection.bulk_write(operations)
diff --git a/mediabridge/definitions.py b/mediabridge/definitions.py
@@ -7,8 +7,8 @@
 
 MODULE_DIR = Path(__file__).parent
 PROJECT_DIR = MODULE_DIR.parent
-DATA_DIR = PROJECT_DIR.joinpath("data")
-OUTPUT_DIR = PROJECT_DIR.joinpath("out")
+DATA_DIR = PROJECT_DIR / "data"
+OUTPUT_DIR = PROJECT_DIR / "out"
 
 if __name__ == "__main__":
     print(MODULE_DIR)

diff --git a/mediabridge/main.py b/mediabridge/main.py
@@ -1,42 +1,32 @@
 import logging
-from contextlib import nullcontext
+from dataclasses import dataclass
 from datetime import datetime
 
 import typer as typer
-from tqdm.contrib.logging import logging_redirect_tqdm
 
 from mediabridge.data_processing import wiki_to_netflix
+from mediabridge.db import load
 from mediabridge.definitions import OUTPUT_DIR
 
+# Root CLI application: prints help when invoked with no arguments and
+# does not add the shell-completion options.
+app = typer.Typer(no_args_is_help=True, add_completion=False)
+# Attach the Typer apps defined in wiki_to_netflix and load.
+app.add_typer(wiki_to_netflix.app)
+app.add_typer(load.app)
 
+
+@dataclass
+class AppContext:
+    """State stored on the Typer context object by the ``main`` callback."""
+
+    # True when the --log option was given (logging goes to a file).
+    log_to_file: bool = False
+
+
+@app.callback()
 def main(
+    ctx: typer.Context,
     verbose: bool = typer.Option(
         False, "--verbose", "-v", help="Enable verbose logging."
     ),
     log: bool = typer.Option(
         False, "--log", "-l", help="Enable all logging message levels and log to file."
     ),
-    full: bool = typer.Option(
-        False,
-        "--full",
-        "-f",
-        help="Run processing on full dataset. Overrides --num_rows.",
-    ),
-    num_rows: int = typer.Option(
-        100,
-        "--num_rows",
-        "-n",
-        help="Number of rows to process. If --full is True, all rows are processed",
-    ),
-    missing_out_path: str = typer.Option(
-        None,
-        "--missing_out_path",
-        "-m",
-        help=(
-            f"If provided, movies that could not be matched will be written to a "
-            f"CSV at this path, relative to the {OUTPUT_DIR} directory."
-        ),
-    ),
 ):
     if not OUTPUT_DIR.exists():
         print(
@@ -48,9 +38,7 @@ def main(
         # log all messages to new file
         logging.basicConfig(
             level=logging.DEBUG,
-            filename=OUTPUT_DIR.joinpath(
-                f"mb_{datetime.now().strftime('%Y%m%d_%H%M%S')}.log"
-            ),
+            filename=OUTPUT_DIR / f"mb_{datetime.now().strftime('%Y%m%d_%H%M%S')}.log",
             filemode="x",
             format="%(asctime)s,%(msecs)d %(name)s %(levelname)s %(message)s",
             datefmt="%H:%M:%S",
@@ -61,22 +49,8 @@ def main(
         else:
             level = logging.WARNING
         logging.basicConfig(level=level, format="[%(levelname)s] %(message)s")
-
-    # We redirect logs to stdout through tqdm to avoid breaking progress bar.
-    # But when logging to file, we use nullcontext or tqdm will redirect logs
-    # back to stdout.
-    with logging_redirect_tqdm() if not log else nullcontext():
-        num_rows = None if full else num_rows
-        try:
-            wiki_to_netflix.process_data(
-                num_rows, output_missing_csv_path=missing_out_path
-            )
-        except Exception as e:
-            # include fatal exceptions with traceback in logs
-            if log:
-                logging.exception("Uncaught exception")
-            raise e
+    ctx.obj = AppContext(log_to_file=log)
 
 
 if __name__ == "__main__":
-    typer.run(main)
+    app()
diff --git a/mediabridge/models/predict.py → mediabridge/recommender/predict.py b/mediabridge/models/predict.py → mediabridge/recommender/predict.py
diff --git a/mediabridge/models/train_model.py → mediabridge/recommender/train_model.py b/mediabridge/models/train_model.py → mediabridge/recommender/train_model.py
diff --git a/mediabridge/models/utils.py → mediabridge/recommender/utils.py b/mediabridge/models/utils.py → mediabridge/recommender/utils.py
diff --git a/mediabridge/schemas.py b/mediabridge/schemas.py
@@ -0,0 +1,36 @@
+from dataclasses import dataclass
+from typing import Optional
+
+
+@dataclass(order=True)
+class MovieData:
+    """Known attributes of a movie taken from the Netflix dataset."""
+
+    netflix_id: int
+    title: str
+    year: int
+
+    def flatten_values(self):
+        """Return a dict mapping each attribute name to a string.
+
+        List values are joined with semicolons; every other value is
+        converted with ``str``.
+        """
+        flattened = {}
+        for name, value in vars(self).items():
+            if isinstance(value, list):
+                flattened[name] = ";".join(value)
+            else:
+                flattened[name] = str(value)
+        return flattened
+
+
+@dataclass(order=True)
+class EnrichedMovieData(MovieData):
+    """Dataclass for enriched data from a Wikidata match.
+
+    Extends MovieData (netflix_id, title, year) with the fields below, so
+    positional construction takes the MovieData fields first.
+    """
+
+    # Wikidata identifier for the matched movie.
+    wikidata_id: str
+    # Genre names; may be None.
+    genres: Optional[list[str]]
+    # Director name; may be None.
+    director: Optional[str]
+
+
+if __name__ == "__main__":
+    # Demo: build one enriched record and print its flattened form.
+    sample = EnrichedMovieData(
+        1, "The Matrix", 1999, "Q11424", ["Action", "Drama"], "Lana Wachowski"
+    )
+    print(sample.flatten_values())