Merge pull request #53 from noisebridge/new-cli-args
Easier debugging of SPARQL queries
audiodude authored Dec 8, 2024
2 parents 35709da + 82e08fd commit d79c3ab
Showing 2 changed files with 47 additions and 17 deletions.
38 changes: 23 additions & 15 deletions mediabridge/data_processing/wiki_to_netflix.py
@@ -29,35 +29,33 @@ class MovieData:
 out_dir = os.path.join(os.path.dirname(__file__), "../../out")
 user_agent = "Noisebridge MovieBot 0.0.1/Audiodude <audiodude@gmail.com>"

+DEFAULT_TEST_ROWS = 100
+

-def read_netflix_txt(txt_file, test):
+def read_netflix_txt(txt_file, num_rows=None):
     """
     Reads and processes a Netflix text file.
     Parameters:
     txt_file (str): Path to the Netflix text file
-    test (Bool): When true, runs the functon in test mode
+    num_rows (int): Number of rows to read from the file, defaults to all
     """
-    num_rows = None
-    if test:
-        num_rows = 50
-
     with open(txt_file, "r", encoding="ISO-8859-1") as netflix_data:
         for i, line in enumerate(netflix_data):
             if num_rows is not None and i >= num_rows:
                 break
             yield line.rstrip().split(",", 2)


-def create_netflix_csv(csv_name, data_list):
+def create_netflix_csv(csv_path, data_list):
     """
     Writes data to a Netflix CSV file.
     Parameters:
     csv_name (str): Name of CSV file to be created
     data_list (list): Row of data to be written to CSV file
     """
-    with open(csv_name, "w") as netflix_csv:
+    with open(csv_path, "w") as netflix_csv:
         csv.writer(netflix_csv).writerows(data_list)
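
The hard-coded 50-row test mode is gone: callers now pass num_rows explicitly, with DEFAULT_TEST_ROWS = 100 supplying the test default in the __main__ block further down. A minimal sketch of the reworked helpers (the file paths here are illustrative, not from the commit):

    # Lazily read the first 100 rows, then write them back out as CSV.
    rows = read_netflix_txt("data/movie_titles.txt", num_rows=100)
    create_netflix_csv("out/movie_titles_sample.csv", list(rows))

Each yielded row is line.rstrip().split(",", 2), i.e. [netflix_id, year, title], where the max-split of 2 keeps commas inside titles intact.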


@@ -101,6 +99,7 @@ def format_sparql_query(title, year):
    Returns:
    SPARQL Query (str): formatted string with movie title and year
    """
+
    QUERY = """
    SELECT * WHERE {
      SERVICE wikibase:mwapi {
@@ -197,7 +196,7 @@ def wiki_query(data_csv, user_agent):

        if not data["results"]["bindings"]:
            wiki_data_list.append(None)
-            log.warning(f"Could not find movie id {id} (' {title} ', {year})")
+            log.warning(f"Could not find movie id {id} ({repr(title)}, {repr(year)})")
        else:
            wiki_data_list.append(
                MovieData(
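
This reworked warning is the commit's debugging payoff: repr() shows exactly what failed to match against Wikidata, where the old hand-quoted format hid stray whitespace and odd characters. A quick illustration (not part of the commit):

    title = " Alien\xa0"  # leading space plus a non-breaking space from a bad parse
    print(f"(' {title} ', 1979)")    # old style: looks roughly like ('  Alien  ', 1979), the junk is invisible
    print(f"({repr(title)}, 1979)")  # new style: (' Alien\xa0', 1979), the junk is explicit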
@@ -213,28 +212,30 @@ def wiki_query(data_csv, user_agent):
    return wiki_data_list


-def process_data(test=False):
+def process_data(num_rows=None, output_missing_csv_path=None):
    """
    Processes Netflix movie data by enriching it with information from Wikidata and writes the results to a CSV file.
-    Netflix data was conveted from a generator to a list to avoid exaustion. was running into an issue where nothing would print to CSV file
+    num_rows (int): Number of rows to process. If None, all rows are processed.
+    output_missing_csv_path (str): If provided, movies that could not be matched will be written to a CSV at this path.
    """
    missing_count = 0
    processed_data = []
+    missing = []

    netflix_data = list(
-        read_netflix_txt(os.path.join(data_dir, "movie_titles.txt"), test)
+        read_netflix_txt(os.path.join(data_dir, "movie_titles.txt"), num_rows=num_rows)
    )

-    netflix_csv = os.path.join(out_dir, "movie_titles.csv")

    enriched_movies = wiki_query(netflix_data, user_agent)

    num_rows = len(enriched_movies)

    for index, row in enumerate(netflix_data):
        netflix_id, year, title = row
        movie_data = enriched_movies[index]
+        # print(index, movie_data)

        if movie_data is None:
            missing_count += 1
            movie = [
@@ -245,6 +246,7 @@ def process_data(test=False):
                "null",
                "null",
            ]
+            missing.append(movie)
        else:
            if movie_data.genre:
                genres = "; ".join(movie_data.genre)
@@ -264,7 +266,11 @@ def process_data(test=False):
            ]
            processed_data.append(movie)

+    netflix_csv = os.path.join(out_dir, "movie_titles.csv")
    create_netflix_csv(netflix_csv, processed_data)
+    if output_missing_csv_path:
+        missing_csv = os.path.join(out_dir, output_missing_csv_path)
+        create_netflix_csv(missing_csv, missing)

    print(
        f"missing: {missing_count} ({missing_count / num_rows * 100:.2f}%)\n"
@@ -276,4 +282,6 @@ def process_data(test=False):
if __name__ == "__main__":
    # Test is true if no argument is passed or if the first argument is not '--prod'.
    test = len(sys.argv) < 2 or sys.argv[1] != "--prod"
-    process_data(test=test)
+    process_data(
+        num_rows=DEFAULT_TEST_ROWS if test else None,
+    )
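
With the new signature, partial runs and missing-movie reports are driven entirely by arguments. A sketch of calling the module directly (assuming the package import path matches the repo layout):

    from mediabridge.data_processing import wiki_to_netflix

    # First 100 rows, recording unmatched movies in out/missing.csv.
    wiki_to_netflix.process_data(num_rows=100, output_missing_csv_path="missing.csv")

    # Full dataset, no missing-movie report.
    wiki_to_netflix.process_data()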
26 changes: 24 additions & 2 deletions mediabridge/main.py
@@ -1,4 +1,5 @@
 import logging
+import os
 from contextlib import nullcontext
 from datetime import datetime

@@ -17,7 +18,25 @@ def main(
        False, "--log", "-l", help="Enable all logging message levels and log to file."
    ),
    full: bool = typer.Option(
-        False, "--full", "-f", help="Run processing on full dataset."
+        False,
+        "--full",
+        "-f",
+        help="Run processing on full dataset. Overrides --num_rows.",
    ),
+    num_rows: int = typer.Option(
+        100,
+        "--num_rows",
+        "-n",
+        help="Number of rows to process. If --full is True, all rows are processed",
+    ),
+    missing_out_path: str = typer.Option(
+        None,
+        "--missing_out_path",
+        "-m",
+        help=(
+            f"If provided, movies that could not be matched will be written to a "
+            f"CSV at this path, relative to the {os.path.abspath(OUTPUT_DIR)} directory."
+        ),
+    ),
):
if not OUTPUT_DIR.exists():
@@ -48,8 +67,11 @@ def main(
    # But when logging to file, we use nullcontext or tqdm will redirect logs
    # back to stdout.
    with logging_redirect_tqdm() if not log else nullcontext():
+        num_rows = None if full else num_rows
        try:
-            wiki_to_netflix.process_data(not full)
+            wiki_to_netflix.process_data(
+                num_rows, output_missing_csv_path=missing_out_path
+            )
        except Exception as e:
            # include fatal exceptions with traceback in logs
            if log:
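
Together the new options replace the single --full switch with explicit row-count and output knobs. Assuming mediabridge/main.py is runnable as a module (the commit does not show the Typer app wiring), typical invocations would look like:

    # Enrich the first 500 rows; write unmatched titles to out/missing.csv.
    python -m mediabridge.main --num_rows 500 --missing_out_path missing.csv

    # Process everything, ignoring --num_rows.
    python -m mediabridge.main --full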
