Skip to content

Commit

Permalink
Consolidate all files and functions for interaction matrix into a mor…
Browse files Browse the repository at this point in the history
…e reasonable location
  • Loading branch information
audiodude committed Nov 23, 2024
1 parent 6d197bd commit e6d4548
Show file tree
Hide file tree
Showing 8 changed files with 78 additions and 63 deletions.
1 change: 0 additions & 1 deletion data_processing/__init__.py

This file was deleted.

14 changes: 0 additions & 14 deletions data_processing/load_data.py

This file was deleted.

37 changes: 0 additions & 37 deletions data_processing/process_data.py

This file was deleted.

8 changes: 0 additions & 8 deletions data_processing/save_data.py

This file was deleted.

1 change: 0 additions & 1 deletion mediabridge/data_processing/build_matrices.py

This file was deleted.

1 change: 0 additions & 1 deletion mediabridge/data_processing/credentials

This file was deleted.

78 changes: 78 additions & 0 deletions mediabridge/data_processing/interaction_matrix.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,78 @@
import os
import pickle

import numpy as np


def list_rating_files(directory_path):
    """Return the sorted names of entries in ``directory_path`` starting with ``mv_``.

    Only the bare entry names are returned (as produced by ``os.listdir``),
    not full paths.
    """
    rating_files = []
    for entry_name in os.listdir(directory_path):
        if entry_name.startswith("mv_"):
            rating_files.append(entry_name)
    rating_files.sort()
    return rating_files


def create_interaction_matrix(directory_path, num_users, num_movies, files):
    """Build a dense user-by-movie matrix holding ratings of 4 or higher.

    Each file must start with a header line containing the movie id followed
    by a colon (for example ``1:``), then one ``user_id,rating,date`` record
    per line. Blank lines are skipped. Ratings below 4 are ignored.

    Rows are assigned to users in the order in which their first qualifying
    rating is encountered; the column for a movie is ``movie_id - 1``.

    Args:
        directory_path: Directory that contains the rating files.
        num_users: Number of rows to allocate in the matrix.
        num_movies: Number of columns to allocate in the matrix.
        files: Iterable of file names, relative to ``directory_path``.

    Returns:
        A ``numpy.ndarray`` of shape ``(num_users, num_movies)`` and dtype
        ``int8`` whose non-zero entries are the retained ratings.

    Raises:
        IndexError: If a movie id falls outside ``1..num_movies`` or more
            than ``num_users`` distinct users have a qualifying rating.
        ValueError: If a header or record line cannot be parsed.
    """
    interaction_matrix = np.zeros((num_users, num_movies), dtype=np.int8)
    user_mapper = {}

    for filename in files:
        with open(os.path.join(directory_path, filename), "r") as file:
            movie_id = int(file.readline().strip().replace(":", ""))
            movie_idx = movie_id - 1
            # A non-positive movie id would yield a negative index, which
            # NumPy silently wraps to a column at the far end of the matrix.
            if not 0 <= movie_idx < num_movies:
                raise IndexError(
                    f"movie id {movie_id} in {filename} is outside "
                    f"the range 1..{num_movies}"
                )

            for line in file:
                record = line.strip()
                if not record:
                    # Tolerate blank lines instead of failing on the unpack.
                    continue

                user_id, rating, _ = record.split(",")
                user_id = int(user_id)
                rating = int(rating)

                if rating < 4:
                    continue

                user_idx = user_mapper.get(user_id)
                if user_idx is None:
                    # Next free row; equals the number of users seen so far.
                    user_idx = len(user_mapper)
                    if user_idx >= num_users:
                        raise IndexError(
                            f"more than {num_users} distinct users found "
                            f"(while reading {filename})"
                        )
                    user_mapper[user_id] = user_idx

                interaction_matrix[user_idx, movie_idx] = rating

    return interaction_matrix


def save_matrix(matrix, output_file):
    """Pickle ``matrix`` to ``output_file`` and print a confirmation message."""
    with open(output_file, "wb") as destination:
        pickler = pickle.Pickler(destination)
        pickler.dump(matrix)
    print(f"Interaction matrix saved to {output_file}")


def main():
    """Main entry point to create and save the interaction matrix.

    Lists the rating files under ``data/training_set/`` (resolved relative
    to this module), builds the interaction matrix from them and pickles it
    to ``output/interaction_matrix.pkl``.
    """
    # Configurations
    data_directory = os.path.join(os.path.dirname(__file__), "../../data/")
    ratings_directory = os.path.join(data_directory, "training_set/")
    output_directory = os.path.join(data_directory, "../output/")
    output_file = os.path.join(output_directory, "interaction_matrix.pkl")

    # Number of users and movies
    num_users = 480189  # Example: Replace with actual value
    num_movies = 17770  # Example: Replace with actual value

    # Create the output directory up front so a missing folder fails fast
    # rather than after the whole matrix has been built.
    os.makedirs(output_directory, exist_ok=True)

    # Step 1: Load Data
    rating_files = list_rating_files(ratings_directory)

    # Step 2: Process Data
    # The file names come from ratings_directory, so they must be opened
    # relative to that same directory (not the parent data directory).
    interaction_matrix = create_interaction_matrix(
        ratings_directory, num_users, num_movies, rating_files
    )

    # Step 3: Save Data
    save_matrix(interaction_matrix, output_file)

    # TODO: Step 4 (not yet enabled): store movies in MongoDB using
    # db.mongo_connection.get_db_connection and
    # db.movie_storage.insert_movie_data.


# Run the pipeline only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()
1 change: 0 additions & 1 deletion mediabridge/data_processing/preprocess.py

This file was deleted.

0 comments on commit e6d4548

Please sign in to comment.