Easier debugging of SPARQL queries #53

Merged
merged 2 commits on Dec 8, 2024
38 changes: 23 additions & 15 deletions mediabridge/data_processing/wiki_to_netflix.py
@@ -29,35 +29,33 @@ class MovieData:
 out_dir = os.path.join(os.path.dirname(__file__), "../../out")
 user_agent = "Noisebridge MovieBot 0.0.1/Audiodude <audiodude@gmail.com>"
 
+DEFAULT_TEST_ROWS = 100
+
 
-def read_netflix_txt(txt_file, test):
+def read_netflix_txt(txt_file, num_rows=None):
     """
     Reads and processes a Netflix text file.
 
     Parameters:
     txt_file (str): Path to the Netflix text file
-    test (bool): When True, runs the function in test mode
+    num_rows (int): Number of rows to read from the file; defaults to all
     """
-    num_rows = None
-    if test:
-        num_rows = 50
-
     with open(txt_file, "r", encoding="ISO-8859-1") as netflix_data:
         for i, line in enumerate(netflix_data):
             if num_rows is not None and i >= num_rows:
                 break
             yield line.rstrip().split(",", 2)
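Note: the row cap now lives at the call site instead of behind a boolean. A minimal usage sketch, assuming a file in the Netflix Prize "id,year,title" layout (the path below is hypothetical, for illustration only):

    # Stream only the first three rows; num_rows=None reads the whole file.
    for netflix_id, year, title in read_netflix_txt("data/movie_titles.txt", num_rows=3):
        # maxsplit=2 in the generator keeps commas inside titles intact,
        # e.g. "Character, The" stays one field.
        print(netflix_id, year, title)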


-def create_netflix_csv(csv_name, data_list):
+def create_netflix_csv(csv_path, data_list):
     """
     Writes data to a Netflix CSV file.
 
     Parameters:
     csv_path (str): Path of the CSV file to be created
     data_list (list): Rows of data to be written to the CSV file
     """
-    with open(csv_name, "w") as netflix_csv:
+    with open(csv_path, "w") as netflix_csv:
         csv.writer(netflix_csv).writerows(data_list)
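Note: renaming csv_name to csv_path makes it explicit that the function takes a full path, not a bare file name. A sketch with invented values:

    # writerows expects an iterable of rows, one list per CSV line.
    rows = [
        ["1", "2003", "Dinosaur Planet"],
        ["2", "2004", "Isle of Man TT 2004 Review"],
    ]
    create_netflix_csv("out/movie_titles.csv", rows)  # file is created or overwritten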


@@ -101,6 +99,7 @@ def format_sparql_query(title, year):
     Returns:
     SPARQL Query (str): formatted string with movie title and year
     """
+
     QUERY = """
     SELECT * WHERE {
       SERVICE wikibase:mwapi {
@@ -197,7 +196,7 @@ def wiki_query(data_csv, user_agent):
 
         if not data["results"]["bindings"]:
             wiki_data_list.append(None)
-            log.warning(f"Could not find movie id {id} (' {title} ', {year})")
+            log.warning(f"Could not find movie id {id} ({repr(title)}, {repr(year)})")
         else:
             wiki_data_list.append(
                 MovieData(
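Note: this is the heart of the "easier debugging" change. repr() delimits the value exactly, so stray whitespace, embedded quotes, and None-versus-"None" confusion all become visible in the log line. A quick illustration with made-up values:

    title, year = "Chirality ", None

    # Old style: literal quotes plus padding spaces obscure the real value.
    print(f"Could not find movie id 42 (' {title} ', {year})")
    # -> Could not find movie id 42 (' Chirality  ', None)

    # New style: the trailing space and the bare None are unmistakable.
    print(f"Could not find movie id 42 ({repr(title)}, {repr(year)})")
    # -> Could not find movie id 42 ('Chirality ', None)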
@@ -213,28 +212,30 @@ def wiki_query(data_csv, user_agent):
     return wiki_data_list
 
 
-def process_data(test=False):
+def process_data(num_rows=None, output_missing_csv_path=None):
     """
     Processes Netflix movie data by enriching it with information from Wikidata and writes the results to a CSV file.
     Netflix data is converted from a generator to a list to avoid exhaustion; otherwise nothing would be written to the CSV file.
 
+    num_rows (int): Number of rows to process. If None, all rows are processed.
+    output_missing_csv_path (str): If provided, movies that could not be matched will be written to a CSV at this path.
     """
     missing_count = 0
     processed_data = []
+    missing = []
 
     netflix_data = list(
-        read_netflix_txt(os.path.join(data_dir, "movie_titles.txt"), test)
+        read_netflix_txt(os.path.join(data_dir, "movie_titles.txt"), num_rows=num_rows)
     )
 
-    netflix_csv = os.path.join(out_dir, "movie_titles.csv")
-
     enriched_movies = wiki_query(netflix_data, user_agent)
 
+    num_rows = len(enriched_movies)
+
     for index, row in enumerate(netflix_data):
         netflix_id, year, title = row
         movie_data = enriched_movies[index]
-        # print(index, movie_data)
 
         if movie_data is None:
             missing_count += 1
             movie = [
@@ -245,6 +246,7 @@ def process_data(test=False):
                 "null",
                 "null",
             ]
+            missing.append(movie)
         else:
             if movie_data.genre:
                 genres = "; ".join(movie_data.genre)
@@ -264,7 +266,11 @@
             ]
         processed_data.append(movie)
 
+    netflix_csv = os.path.join(out_dir, "movie_titles.csv")
     create_netflix_csv(netflix_csv, processed_data)
+    if output_missing_csv_path:
+        missing_csv = os.path.join(out_dir, output_missing_csv_path)
+        create_netflix_csv(missing_csv, missing)
 
     print(
         f"missing: {missing_count} ({missing_count / num_rows * 100:.2f}%)\n"
@@ -276,4 +282,6 @@
 if __name__ == "__main__":
     # Test is true if no argument is passed or if the first argument is not '--prod'.
     test = len(sys.argv) < 2 or sys.argv[1] != "--prod"
-    process_data(test=test)
+    process_data(
+        num_rows=DEFAULT_TEST_ROWS if test else None,
+    )
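Note: with both parameters in place, a caller can cap the run and capture the misses in one call. A hedged sketch ("missing.csv" is an example name, resolved relative to the out directory):

    # Enrich the first 100 titles and record unmatched ones in out/missing.csv.
    process_data(num_rows=100, output_missing_csv_path="missing.csv")

    # Enrich everything and discard the misses (the old --prod behavior).
    process_data(num_rows=None)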
26 changes: 24 additions & 2 deletions mediabridge/main.py
@@ -1,4 +1,5 @@
 import logging
+import os
 from contextlib import nullcontext
 from datetime import datetime
 
@@ -17,7 +18,25 @@ def main(
         False, "--log", "-l", help="Enable all logging message levels and log to file."
     ),
     full: bool = typer.Option(
-        False, "--full", "-f", help="Run processing on full dataset."
+        False,
+        "--full",
+        "-f",
+        help="Run processing on the full dataset. Overrides --num_rows.",
     ),
+    num_rows: int = typer.Option(
+        100,
+        "--num_rows",
+        "-n",
+        help="Number of rows to process. If --full is given, all rows are processed.",
+    ),
+    missing_out_path: str = typer.Option(
+        None,
+        "--missing_out_path",
+        "-m",
+        help=(
+            f"If provided, movies that could not be matched will be written to a "
+            f"CSV at this path, relative to the {os.path.abspath(OUTPUT_DIR)} directory."
+        ),
+    ),
 ):
     if not OUTPUT_DIR.exists():
@@ -48,8 +67,11 @@ def main(
     # But when logging to file, we use nullcontext or tqdm will redirect logs
     # back to stdout.
     with logging_redirect_tqdm() if not log else nullcontext():
+        num_rows = None if full else num_rows
         try:
-            wiki_to_netflix.process_data(not full)
+            wiki_to_netflix.process_data(
+                num_rows, output_missing_csv_path=missing_out_path
+            )
         except Exception as e:
             # include fatal exceptions with traceback in logs
             if log:
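Note: the surrounding "with logging_redirect_tqdm() if not log else nullcontext():" line picks a context manager at runtime; nullcontext is the standard library's no-op stand-in for when no redirection is wanted. A generic sketch of that pattern, with invented names:

    from contextlib import nullcontext

    def run(log_to_file: bool):
        # Choose a real context manager or a do-nothing placeholder at runtime.
        ctx = open("run.log", "w") if log_to_file else nullcontext()
        with ctx as handle:
            if handle is not None:  # nullcontext() yields None
                handle.write("started\n")

On the CLI side, a run such as "main.py --num_rows 500 --missing_out_path missing.csv" (invocation path assumed) would cap processing at 500 rows and write unmatched titles under the output directory, while --full overrides the cap.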