# -*- coding: utf-8 -*-
from typing import List
from commons.RatingPredictor import RatingPredictor
from collaborative_filtering.RowPearsonSimilarityMatrix import RowPearsonSimilarityMatrix
from collaborative_filtering.clustering.ClusterCollaborativeFiltering import ClusterCollaborativeFiltering
from collaborative_filtering.global_baseline.ItemGlobalBaselineCollaborativeFiltering import \
    ItemGlobalBaselineCollaborativeFiltering
from collaborative_filtering.global_baseline.UserGlobalBaselineCollaborativeFiltering import \
    UserGlobalBaselineCollaborativeFiltering
from collaborative_filtering.naive.ItemNaiveCollaborativeFiltering import ItemNaiveCollaborativeFiltering
from collaborative_filtering.naive.UserNaiveCollaborativeFiltering import UserNaiveCollaborativeFiltering
from data_handling.DataLoader import DataLoader
from data_handling.DataPathProvider import DataPathProvider
from data_handling.DiskPersistor import DiskPersistor
from commons.FormulaFactory import FormulaFactory
from commons.FormulaFactory import ScoringMeasureType
from matrix_factorization.UvDecomposition import UvDecomposer
from matrix_factorization.RegularizedUvDecompositon import RegularizedUvDecomposer
from matrix_factorization.BiasUvDecomposition import BiasUvDecomposer
from data_handling.LocalFileCsvProvider import LocalFileCsvProvider
"""
FRAMEWORK FOR DATAMINING CLASS
#### IDENTIFICATION
NAME: Pietro, Catalin
SURNAME: Vigilanza, Lupau
STUDENT ID: -, 5042143
KAGGLE ID: -, C.P.Lupau@student.tudelft.nl
### NOTES
This file is an example of what your code should look like.
To know more about the expectations, please refer to the guidelines.
"""
#####
##
## DATA IMPORT
##
#####
# Where data is located
movies_file = './data/movies.csv'
users_file = './data/users.csv'
ratings_file = './data/ratings.csv'
predictions_file = './data/predictions.csv'
submission_file = './data/submissions/submission.csv'
# Create a data path provider
data_path_provider = DataPathProvider(movies_path=movies_file, users_path=users_file, ratings_path=ratings_file, predictions_path=predictions_file, submission_path=submission_file)
# Create a data loader
data_loader = DataLoader(data_path_provider=data_path_provider, csv_provider=LocalFileCsvProvider())
disk_persistor = DiskPersistor()
formula_factory = FormulaFactory()
# Create the user and movie similarity matrices if they have not been computed already (results are cached on disk)
sym_matrix_results = disk_persistor.perist_computation([
    (lambda: RowPearsonSimilarityMatrix(data_loader.get_ratings_matrix()), 'global_pearson_similarity_matrix'),
    (lambda: RowPearsonSimilarityMatrix(data_loader.get_ratings_matrix().T), 'global_pearson_similarity_matrix_movie')
], force_update=False)
global_pearson_similarity_matrix_user = sym_matrix_results[0]
global_pearson_similarity_matrix_movie = sym_matrix_results[1]
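# ---------------------------------------------------------------------------
# Illustrative sketch (an assumption, never called below): the pairwise row
# similarity that RowPearsonSimilarityMatrix is assumed to compute is a
# Pearson / centered-cosine score over co-rated entries, treating 0 as a
# missing rating. The actual class may handle missing values differently.
# ---------------------------------------------------------------------------
import numpy as np

def _pearson_row_similarity_sketch(row_a: np.ndarray, row_b: np.ndarray) -> float:
    """Pearson correlation between two rating rows, restricted to co-rated entries."""
    co_rated = (row_a > 0) & (row_b > 0)
    if not co_rated.any():
        return 0.0
    a_centered = row_a[co_rated] - row_a[co_rated].mean()
    b_centered = row_b[co_rated] - row_b[co_rated].mean()
    denominator = np.linalg.norm(a_centered) * np.linalg.norm(b_centered)
    return float(a_centered @ b_centered / denominator) if denominator > 0 else 0.0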
# Create the formula factory and select the true RMSE scoring measure
formula_factory = FormulaFactory()
scoring_measure = ScoringMeasureType.BIAS_TRUE_RMSE
def predict(predictor: RatingPredictor, force_update: bool, weights: List[float]):
    """Run the predictor and return 1-indexed [Id, Rating] rows for the submission."""
    predictor.perform_precomputations(force_update=force_update)
    predictions = predictor.make_average_prediction(weights=weights).values()
    predictions = list(predictions)
    number_predictions = len(predictions)
    return [[idx, predictions[idx - 1]] for idx in range(1, number_predictions + 1)]
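# Hedged sketch of the blending assumed to happen inside make_average_prediction:
# each strategy in prediction_strategies produces one rating per prediction row,
# and the final rating is the weighted sum of those per-strategy ratings.
# Illustrative only; the real combination logic lives in RatingPredictor.
def _weighted_blend_sketch(per_strategy_ratings: List[float], weights: List[float]) -> float:
    return sum(weight * rating for weight, rating in zip(weights, per_strategy_ratings))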
#####
##
## SAVE RESULTS
##
#####
def predict_and_write_to_file(predictor: RatingPredictor, force_update: bool, weights: List[float], submission_file: str):
    ## //!!\\ TO CHANGE by your prediction function
    predictions = predict(predictor, force_update=force_update, weights=weights)
    # Save predictions, should be in the form 'list of tuples' or 'list of lists'
    with open(submission_file, 'w') as submission_writer:
        # Format the data as 'Id,Rating' rows
        predictions = [map(str, row) for row in predictions]
        predictions = [','.join(row) for row in predictions]
        predictions = 'Id,Rating\n' + '\n'.join(predictions)
        # Write it down
        submission_writer.write(predictions)
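# Optional reference sketch (not used below): the same 'Id,Rating' submission
# format written with the standard csv module instead of manual string joining.
import csv

def _write_submission_with_csv_sketch(rows: List[List[float]], path: str) -> None:
    with open(path, 'w', newline='') as handle:
        writer = csv.writer(handle)
        writer.writerow(['Id', 'Rating'])
        writer.writerows(rows)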
# User-user and item-item collaborative filtering
naive_colab_predictor: RatingPredictor = RatingPredictor(
    data_loader=data_loader,
    disk_persistor=disk_persistor,
    persistence_id='predictor_naive',
    prediction_strategies=[
        ItemNaiveCollaborativeFiltering(
            k_neighbors=30,
            sim_matrix=global_pearson_similarity_matrix_movie
        ),
        UserNaiveCollaborativeFiltering(
            k_neighbors=30,
            sim_matrix=global_pearson_similarity_matrix_user
        )
    ]
)
# Weighted blend of the two strategies above
predict_and_write_to_file(naive_colab_predictor, True, [0.7, 0.3], 'data/submissions/naive_colab.csv')
# Clustering collaborative filtering
clustering_predictor: RatingPredictor = RatingPredictor(
    data_loader=data_loader,
    disk_persistor=disk_persistor,
    persistence_id='predictor_clustering',
    prediction_strategies=[
        ClusterCollaborativeFiltering(
            row_similarity_matrix=global_pearson_similarity_matrix_user,
            col_similarity_matrix=global_pearson_similarity_matrix_movie,
            new_dim_ratio=(0.8, 0.8),
            k_neighbors=35,
            randomized=True,
            randomized_num_extractions=1000,
            random_seed=3
        )
    ]
)
predict_and_write_to_file(clustering_predictor, True, [1], 'data/submissions/clustering.csv')
# User-user and item-item global baseline collaborative filtering
global_baseline_predictor: RatingPredictor = RatingPredictor(
    data_loader=data_loader,
    disk_persistor=disk_persistor,
    persistence_id='predictor_baseline',
    prediction_strategies=[
        ItemGlobalBaselineCollaborativeFiltering(
            k_neighbors=30,
            sim_matrix=global_pearson_similarity_matrix_movie
        ),
        UserGlobalBaselineCollaborativeFiltering(
            k_neighbors=30,
            sim_matrix=global_pearson_similarity_matrix_user
        )
    ]
)
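# Hedged sketch of the global baseline estimate assumed to underlie the
# *GlobalBaselineCollaborativeFiltering strategies: predictions start from
# overall_mean + user_deviation + item_deviation and are corrected with a
# similarity-weighted average of the neighbors' residuals around their own
# baselines. The names and signature below are illustrative, not the real API.
def _global_baseline_sketch(baseline_rating: float,
                            neighbor_similarities: List[float],
                            neighbor_residuals: List[float]) -> float:
    total_similarity = sum(abs(s) for s in neighbor_similarities)
    if total_similarity == 0:
        return baseline_rating
    correction = sum(s * r for s, r in zip(neighbor_similarities, neighbor_residuals)) / total_similarity
    return baseline_rating + correction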
# Global baseline strategies combined with a biased UV decomposition
global_biased_UV: RatingPredictor = RatingPredictor(
    data_loader=data_loader,
    disk_persistor=disk_persistor,
    persistence_id='predictor_baseline',
    prediction_strategies=[
        ItemGlobalBaselineCollaborativeFiltering(
            k_neighbors=30,
            sim_matrix=global_pearson_similarity_matrix_movie
        ),
        UserGlobalBaselineCollaborativeFiltering(
            k_neighbors=30,
            sim_matrix=global_pearson_similarity_matrix_user
        ),
        BiasUvDecomposer(
            iterations=55,
            d=7,
            mu=0.003,
            delta1=0.10,
            delta2=0.06,
            bias_weight1=0.11,
            bias_weight2=0.08,
            formula_factory=formula_factory,
            scorer_type=scoring_measure
        )
    ]
)
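# Hedged sketch of the rating reconstruction assumed to be produced by
# BiasUvDecomposer after fitting: global mean plus user/item biases plus the
# dot product of the d-dimensional latent factors (d=7 above). The names
# below are illustrative and are not attributes of the real decomposer.
def _biased_uv_prediction_sketch(global_mean: float,
                                 user_bias: float,
                                 item_bias: float,
                                 user_factors: List[float],
                                 item_factors: List[float]) -> float:
    latent_term = sum(u * v for u, v in zip(user_factors, item_factors))
    return global_mean + user_bias + item_bias + latent_term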
# Blend the three strategies above with the given weights
predict_and_write_to_file(global_biased_UV, False, [0.4, 0.1, 0.5], 'data/submissions/new_Predictions1.csv')
#0.8036744443488222
#0.80265563268677