-
Notifications
You must be signed in to change notification settings - Fork 4
/
main.cpp
1427 lines (1188 loc) · 44.8 KB
/
main.cpp
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
//Netflix Prize
//Jordan Turley and Michael Lamar
//
//This implementation is using a sphere
//For a data point, the user and movie rating vectors are attracted together
//Then, the user and movie rating are repelled away from random movie ratings
//and users, independently.
//Then, the user is repelled from a random user and the movie-rating from a random movie-rating
//Then, the movie-rating is repelled from the four other movie-ratings for the same movie.
//
//Eta is the strength of the attraction and repulsion
//Phi is used to slow down attraction and repulsion as a vector is seen over and over
//
//The current z is printed out 100 times every iteration (every %), as well as
//the average distance between two movie-ratings, two users, a random user and
//movie-rating, and a user movie-rating pair from a random data point. The
//likelihood value is also printed out.
//
//The RMSE is printed after each iteration.
//
//The data is randomly split into three separate sets. One for training, one
//for validation, and one for test. The model is trained on the training set.
//Then, after each iteration, the RMSE for the model is calculated on the
//validation set. After we have everything configured, the model will be
//evaluated on the test set.
//
//To compile: g++ main.cpp -o netflix -std=c++11 -O3
//To run on Linux: ./netflix input_file.txt data_file.bin 100480507
//To run on Windows: netflix input_file.txt data_file.bin 100480507
//The number at the end can be any number, as long as it is greater than the sample sizes.
//For example, 1000000 to just run it on the first 1000000 data points
//Running the program on the full data set takes almost 4.5 GB of RAM
#include <iostream>
#include <fstream>
#include <random>
#include <ctime>
#include <algorithm>
using namespace std;
//Struct to hold settings from file
//Returned from readSettings
struct Settings {
int dimensions;
double eta;
int phiUser;
int phiMR;
int iterations;
int repelNum;
};
//Struct to hold data read from binary file
//Returned from readData
struct Data {
int** data;
int* dataIndices;
int maxUserId;
int maxMovieId;
};
//Struct to hold vector arrays for generating vectors.
//Returned from generateVectors
struct Vectors {
double** userVectors;
int *userCounts;
int *userCumulativeCounts;
int totalUsers;
double*** movieRatingVectors;
int **movieRatingCounts;
int **movieRatingCumulativeCounts;
int totalMovies;
};
//Struct to hold three sets: training, validation, and test
//Returned from splitDatasets
struct Datasets {
int* trainIndices;
int trainSize;
int* validationIndices;
int validationSize;
int* testIndices;
int testSize;
};
//Struct to hold initial value of z and array of sample values of z
//Returned from calculateInitialZ
struct ZValues {
double z;
double* zValues;
};
struct Settings readSettings(char* file);
struct Data readData(char* file, int numDataPoints);
struct Vectors generateVectors(
int** data,
int numDataPoints,
int maxUserId,
int maxMovieId,
int dimensions);
struct Datasets splitDatasets(int* dataIndices, int numDataPoints);
int* generateSet(int* dataIndices, int startIdx, int endIdx);
struct ZValues calculateInitialZ(
int* trainIndices,
int trainSize,
int** data,
double** userVectors,
double*** movieRatingVectors,
mt19937 random,
uniform_int_distribution<int> randomDataPoint,
int sampleSize,
int dimensions);
double getDistanceSquared(double a, double b);
double getDistanceSquared(double *a, double *b, int dimensions);
double norm2(double *arr, int length);
double calculateEta(double etaInitial, double phi, int count);
void moveVectors(
double* user,
double* mr,
double movieRating,
double etaInitial,
double userEta,
double mrEta,
double** userVectorsRepel,
double** mrVectorsRepel,
int numRepel,
double* randomUser,
double* randomMR,
int dimensions,
double z,
double euu,
double emrmr);
double attract(double a, double b, double eta);
double repel(double a, double other, double eta, double z);
double normalize(double a, double initialEta, double delta, double aNorm2);
double normalizeUnitLength(double a, double aNorm2);
double calculateRMSE(
int* evaluationIndices,
int evaluationSize,
int** data,
double** userVectors,
double*** movieRatingVectors,
int** movieRatingCounts,
int dimensions);
double calculateRMSEEmpirical(
int* evaluationIndices,
int evaluationSize,
int** data,
int** movieRatingCounts);
void writeBarGraphValues(
int** data,
double** userVectors,
double*** movieRatingVectors,
int** movieRatingCounts,
int dimensions,
int barGraphIndices[]);
const int NUM_PARTS = 3; //How many parts are there to each triple? 3 - user id, movie id, rating
const int USER_ID_IDX = 0; //The index of the user id in the array storing the triple
const int MOVIE_ID_IDX = 1; //The index of the movie id in the array storing the triple
const int MOVIE_RATING_IDX = 2; //The index of the movie rating in the array storing the triple
const int MAX_STARS = 5;
//Sample size when calculating average distances for debugging
//Ex. average distance between two random users, two random movie-ratings, ...
const int AVERAGE_SAMPLE_SIZE = 10000;
//Sizes for each set
const double TRAIN_SIZE = 0.8;
const double VALIDATION_SIZE = 0.1;
const double TEST_SIZE = 1 - TRAIN_SIZE - VALIDATION_SIZE;
const int Z_SAMPLE_SIZE = 10000;
//TODO move this into main if it is calculated depending on something like dimensions
const double SCALING_FACTOR = 1;
//The input file has:
//dimensions
//initial eta
//phi for users
//phi for movie ratings
//number of iterations
//z sample size
//score sample size
const int BAR_GRAPH_COUNT = 1;
int main(int argc, char *argv[]) {
//Seed general random number generator
srand(time(0));
//Get the input file from command line
char *settingsFile = (char *) "C:\\input_sphere.txt";
if (argc > 1) { //The first command-line argument is the name of the program
settingsFile = argv[1];
}
//Get the data file from command line
char *dataFile = (char *) "C:\\netflix_data_c.bin";
if (argc > 2) {
dataFile = argv[2];
}
//Get the number of data points from command line
int numDataPoints = 1000000;
if (argc > 3) {
numDataPoints = strtol(argv[3], NULL, 10);
}
//Read in settings from the input file
struct Settings settings = readSettings(settingsFile);
//Pull a few settings out of the struct since it is used a lot
int dimensions = settings.dimensions;
//Calculate delta based on eta
double delta = 1 / (4 * settings.eta);
cout << "Reading in data" << endl;
//Read in the data points from the binary file
struct Data dataStruct = readData(dataFile, numDataPoints);
int** data = dataStruct.data;
int* dataIndices = dataStruct.dataIndices;
int maxUserId = dataStruct.maxUserId;
int maxMovieId = dataStruct.maxMovieId;
cout << "Max user id: " << maxUserId << endl;
cout << "Max movie id: " << maxMovieId << endl;
cout << "Initializing vectors" << endl;
//Generate the vectors
struct Vectors vectors = generateVectors(data, numDataPoints, maxUserId, maxMovieId, dimensions);
//Get the vector and count arrays from the struct
double** userVectors = vectors.userVectors;
int *userCounts = vectors.userCounts;
int *userCumulativeCounts = vectors.userCumulativeCounts;
double*** movieRatingVectors = vectors.movieRatingVectors;
int **movieRatingCounts = vectors.movieRatingCounts;
int **movieRatingCumulativeCounts = vectors.movieRatingCumulativeCounts;
cout << "Number of users: " << vectors.totalUsers << endl;
cout << "Number of movies: " << vectors.totalMovies << endl;
//Split the data into three datasets: training, validation, and test
struct Datasets datasets = splitDatasets(dataIndices, numDataPoints);
int* trainIndices = datasets.trainIndices;
int trainSize = datasets.trainSize;
int* validationIndices = datasets.validationIndices;
int validationSize = datasets.validationSize;
int* testIndices = datasets.testIndices;
int testSize = datasets.testSize;
//Init random data point generator from training set
mt19937 random(time(0));
uniform_int_distribution<int> randomDataPoint(0, trainSize - 1);
//Print out the size of each set
cout << "Set sizes:" << endl;
cout << "Training set: " << trainSize << endl;
cout << "Validation set: " << validationSize << endl;
cout << "Test set: " << testSize << endl;
//Clear out the original array of data indices after it's split up
delete[] dataIndices;
cout << "Calculating initial value of Z" << endl;
//Calculate the initial value of z
struct ZValues zStruct = calculateInitialZ(
trainIndices,
trainSize,
data,
userVectors,
movieRatingVectors,
random,
randomDataPoint,
Z_SAMPLE_SIZE,
dimensions);
double z = zStruct.z;
double* zValues = zStruct.zValues;
int oldestIdx = 0;
//Calculate average value of exp[-d2(u, u)] and exp[-d2(mr, mr)]
double euu = 0;
double* euuValues = new double[Z_SAMPLE_SIZE];
double emrmr = 0;
double* emrmrValues = new double[Z_SAMPLE_SIZE];
for (int i1 = 0; i1 < Z_SAMPLE_SIZE; i1++) {
int user1Idx = trainIndices[randomDataPoint(random)];
int* user1DataPt = data[user1Idx];
int user1Id = user1DataPt[USER_ID_IDX];
double* user1Vector = userVectors[user1Id - 1];
int user2Idx = trainIndices[randomDataPoint(random)];
int* user2DataPt = data[user2Idx];
int user2Id = user2DataPt[USER_ID_IDX];
double* user2Vector = userVectors[user2Id - 1];
double euuVal = exp(SCALING_FACTOR * -getDistanceSquared(user1Vector, user2Vector, dimensions));
euu += euuVal;
euuValues[i1] = euuVal;
int mr1Idx = trainIndices[randomDataPoint(random)];
int* mr1DataPt = data[mr1Idx];
int movie1Id = mr1DataPt[MOVIE_ID_IDX];
int movie1Rating = mr1DataPt[MOVIE_RATING_IDX];
double* mr1Vector = movieRatingVectors[movie1Id - 1][movie1Rating - 1];
int mr2Idx = trainIndices[randomDataPoint(random)];
int* mr2DataPt = data[mr2Idx];
int movie2Id = mr2DataPt[MOVIE_ID_IDX];
int movie2Rating = mr2DataPt[MOVIE_RATING_IDX];
double* mr2Vector = movieRatingVectors[movie2Id - 1][movie2Rating - 1];
double emrmrVal = exp(SCALING_FACTOR * -getDistanceSquared(mr1Vector, mr2Vector, dimensions));
emrmr += euuVal;
emrmrValues[i1] = emrmrVal;
}
euu /= Z_SAMPLE_SIZE;
emrmr /= Z_SAMPLE_SIZE;
//Save the initial z in case we need to use it later, and print it out
double initialZ = z;
cout << "Initial z: " << z << endl;
//Calculate the RMSE using only the empirical probabilities w/o the model
double rmseEmpirical = calculateRMSEEmpirical(validationIndices, validationSize, data, movieRatingCounts);
cout << "Empirical_RMSE: " << rmseEmpirical << endl;
//Go through number of iterations to move vectors
for (int iteration = 0; iteration < settings.iterations; iteration++) {
random_shuffle(&trainIndices[0], &trainIndices[trainSize - 1]);
int reportNum = trainSize / 100;
cout << "Starting iteration " << iteration + 1 << endl;
//Go through each data point in the training set
for (int dataIdx = 0; dataIdx < trainSize; dataIdx++) {
int idx = trainIndices[dataIdx];
int* dataPt = data[idx];
//Get the user id, movie id, and movie rating from the data point
int userId = dataPt[USER_ID_IDX];
int movieId = dataPt[MOVIE_ID_IDX];
int movieRating = dataPt[MOVIE_RATING_IDX];
//Update cumulative counts
userCumulativeCounts[userId - 1]++;
movieRatingCumulativeCounts[movieId - 1][movieRating - 1]++;
//Get the vectors and calculate eta for the user and movie rating vectors
double* userVec = userVectors[userId - 1];
double userEta = calculateEta(settings.eta, settings.phiUser, userCumulativeCounts[userId - 1]);
double* movieRatingVec = movieRatingVectors[movieId - 1][movieRating - 1];
double mrEta = calculateEta(settings.eta, settings.phiMR, movieRatingCumulativeCounts[movieId - 1][movieRating - 1]);
double** userVectorsRepel = new double*[settings.repelNum];
double** mrVectorsRepel = new double*[settings.repelNum];
//Get small sample of vectors to repel from
for (int i1 = 0; i1 < settings.repelNum; i1++) {
//Get a user vector
int idxTemp = randomDataPoint(random);
int* dataPtTemp = data[trainIndices[idxTemp]];
int userIdTemp = dataPtTemp[USER_ID_IDX];
double* userVecTemp = userVectors[userIdTemp - 1];
userVectorsRepel[i1] = userVecTemp;
idxTemp = randomDataPoint(random);
dataPtTemp = data[trainIndices[idxTemp]];
int movieIdTemp = dataPtTemp[MOVIE_ID_IDX];
int movieRatingTemp = dataPtTemp[MOVIE_RATING_IDX];
double* mrVecTemp = movieRatingVectors[movieIdTemp - 1][movieRatingTemp - 1];
mrVectorsRepel[i1] = mrVecTemp;
}
//Get more random vectors for user-user and mr-mr repulsion
int randomUserDataIdx = randomDataPoint(random);
idx = trainIndices[randomUserDataIdx];
dataPt = data[idx];
int randomUserId = dataPt[USER_ID_IDX];
double* randomUserVec = userVectors[randomUserId - 1];
int randomMRDataIdx = randomDataPoint(random);
idx = trainIndices[randomMRDataIdx];
dataPt = data[idx];
int randomMovieId = dataPt[MOVIE_ID_IDX];
int randomMovieRating = dataPt[MOVIE_RATING_IDX];
double* randomMRVec = movieRatingVectors[randomMovieId - 1][randomMovieRating - 1];
//Move the vectors toward each other, and away from the randomly chosen vectors
moveVectors(
userVec,
movieRatingVec,
movieRating,
settings.eta,
userEta,
mrEta,
userVectorsRepel,
mrVectorsRepel,
settings.repelNum,
randomUserVec,
randomMRVec,
dimensions,
z,
euu,
emrmr);
//Deallocate the vector repel arrays
delete[] userVectorsRepel;
delete[] mrVectorsRepel;
//Get random new vectors to update z with
randomUserDataIdx = randomDataPoint(random);
idx = trainIndices[randomUserDataIdx];
dataPt = data[idx];
randomUserId = dataPt[USER_ID_IDX];
randomUserVec = userVectors[randomUserId - 1];
randomMRDataIdx = randomDataPoint(random);
idx = trainIndices[randomMRDataIdx];
dataPt = data[idx];
randomMovieId = dataPt[MOVIE_ID_IDX];
randomMovieRating = dataPt[MOVIE_RATING_IDX];
randomMRVec = movieRatingVectors[randomMovieId - 1][randomMovieRating - 1];
//Actually update the value of z
double oldZVal = zValues[oldestIdx];
double newZVal = exp(SCALING_FACTOR * -getDistanceSquared(randomUserVec, randomMRVec, dimensions));
zValues[oldestIdx] = newZVal;
z += (newZVal - oldZVal) / Z_SAMPLE_SIZE;
//Get random new vectors to update euu with
randomUserDataIdx = randomDataPoint(random);
idx = trainIndices[randomUserDataIdx];
dataPt = data[idx];
randomUserId = dataPt[USER_ID_IDX];
double* randomUserVec1 = userVectors[randomUserId - 1];
randomUserDataIdx = randomDataPoint(random);
idx = trainIndices[randomUserDataIdx];
dataPt = data[idx];
randomUserId = dataPt[USER_ID_IDX];
double* randomUserVec2 = userVectors[randomUserId - 1];
//Update value of euu
double oldEuuVal = euuValues[oldestIdx];
double newEuuVal = exp(SCALING_FACTOR * -getDistanceSquared(randomUserVec1, randomUserVec2, dimensions));
euuValues[oldestIdx] = newEuuVal;
euu += (newEuuVal - oldEuuVal) / Z_SAMPLE_SIZE;
//Get random new vectors to update emrmr with
randomMRDataIdx = randomDataPoint(random);
idx = trainIndices[randomMRDataIdx];
dataPt = data[idx];
randomMovieId = dataPt[MOVIE_ID_IDX];
randomMovieRating = dataPt[MOVIE_RATING_IDX];
double* randomMRVec1 = movieRatingVectors[randomMovieId - 1][randomMovieRating - 1];
randomMRDataIdx = randomDataPoint(random);
idx = trainIndices[randomMRDataIdx];
dataPt = data[idx];
randomMovieId = dataPt[MOVIE_ID_IDX];
randomMovieRating = dataPt[MOVIE_RATING_IDX];
double* randomMRVec2 = movieRatingVectors[randomMovieId - 1][randomMovieRating - 1];
//Update value of emrmr
double oldEmrmrVal = emrmrValues[oldestIdx];
double newEmrmrVal = exp(SCALING_FACTOR * -getDistanceSquared(randomMRVec1, randomMRVec2, dimensions));
emrmrValues[oldestIdx] = newEmrmrVal;
emrmr += (newEmrmrVal - oldEmrmrVal) / Z_SAMPLE_SIZE;
oldestIdx++;
oldestIdx %= Z_SAMPLE_SIZE;
//Print out logging information 100 times an iterations
if (dataIdx % reportNum == 0) { //Print out Z and the percentage completed of the iteration
double perc = (double) dataIdx / trainSize * 100;
cout << perc << "%, Z: " << z << endl;
//Calculate averages for data collection:
//Two random movie ratings, two random users, a random user and random movie rating, and a user and movie rating from a random data point
//First, two random movie ratings
double mrmrAvg = 0;
for (int i1 = 0; i1 < AVERAGE_SAMPLE_SIZE; i1++) {
//Pick two random movie ratings
int index1 = randomDataPoint(random);
int index2 = randomDataPoint(random);
index1 = trainIndices[index1];
index2 = trainIndices[index2];
int* dataPt1 = data[index1];
int movieId1 = dataPt1[MOVIE_ID_IDX];
int movieRating1 = dataPt1[MOVIE_RATING_IDX];
double *movieRatingVec1 = movieRatingVectors[movieId1 - 1][movieRating1 - 1];
int* dataPt2 = data[index2];
int movieId2 = dataPt2[MOVIE_ID_IDX];
int movieRating2 = dataPt2[MOVIE_RATING_IDX];
double *movieRatingVec2 = movieRatingVectors[movieId2 - 1][movieRating2 - 1];
//Calculate distance between these two
double distance = sqrt(getDistanceSquared(movieRatingVec1, movieRatingVec2, dimensions));
mrmrAvg += distance;
}
mrmrAvg /= AVERAGE_SAMPLE_SIZE;
cout << "MRMR: " << mrmrAvg << endl;
//Then, two random users
double useruserAvg = 0;
for (int i1 = 0; i1 < AVERAGE_SAMPLE_SIZE; i1++) {
int index1 = randomDataPoint(random);
int index2 = randomDataPoint(random);
index1 = trainIndices[index1];
index2 = trainIndices[index2];
int *dataPt1 = data[index1];
int userId1 = dataPt1[USER_ID_IDX];
double *userVec1 = userVectors[userId1 - 1];
int *dataPt2 = data[index2];
int userId2 = dataPt2[USER_ID_IDX];
double *userVec2 = userVectors[userId2 - 1];
double distance = sqrt(getDistanceSquared(userVec1, userVec2, dimensions));
useruserAvg += distance;
}
useruserAvg /= AVERAGE_SAMPLE_SIZE;
cout << "User_User: " << useruserAvg << endl;
//Then, a random user and random movie rating
double randUserMrAvg = 0;
for (int i1 = 0; i1 < AVERAGE_SAMPLE_SIZE; i1++) {
int userIndex = randomDataPoint(random);
int mrIndex = randomDataPoint(random);
userIndex = trainIndices[userIndex];
mrIndex = trainIndices[mrIndex];
int *userDataPt = data[userIndex];
int userId = userDataPt[USER_ID_IDX];
double *userVec = userVectors[userId - 1];
int *mrDataPt = data[mrIndex];
int movieId = mrDataPt[MOVIE_ID_IDX];
int movieRating = mrDataPt[MOVIE_RATING_IDX];
double *mrVec = movieRatingVectors[movieId - 1][movieRating - 1];
double distance = sqrt(getDistanceSquared(userVec, mrVec, dimensions));
randUserMrAvg += distance;
}
randUserMrAvg /= AVERAGE_SAMPLE_SIZE;
cout << "Rand_User_MR: " << randUserMrAvg << endl;
//Finally, distance between user and movie rating for a random data point
double usermrAvg = 0;
for (int i1 = 0; i1 < AVERAGE_SAMPLE_SIZE; i1++) {
int dataIdx = randomDataPoint(random);
dataIdx = trainIndices[dataIdx];
int *dataPt = data[dataIdx];
int userId = dataPt[USER_ID_IDX];
double *userVec = userVectors[userId - 1];
int movieId = dataPt[MOVIE_ID_IDX];
int movieRating = dataPt[MOVIE_RATING_IDX];
double *mrVec = movieRatingVectors[movieId - 1][movieRating - 1];
double distance = sqrt(getDistanceSquared(userVec, mrVec, dimensions));
usermrAvg += distance;
}
usermrAvg /= AVERAGE_SAMPLE_SIZE;
cout << "User_MR: " << usermrAvg << endl;
//Calculate the likelihood
double likelihoodAvg = 0;
for (int i1 = 0; i1 < AVERAGE_SAMPLE_SIZE; i1++) {
//Get a random data index
int dataIdx = randomDataPoint(random);
dataIdx = trainIndices[dataIdx];
//Get the data point: the user id, movie id, and rating
int *dataPt = data[dataIdx];
//Get the user vector
int userId = dataPt[USER_ID_IDX];
double *userVec = userVectors[userId - 1];
//Get the movie rating vector
int movieId = dataPt[MOVIE_ID_IDX];
int movieRating = dataPt[MOVIE_RATING_IDX];
double *mrVec = movieRatingVectors[movieId - 1][movieRating - 1];
//Calculate pbar for the user and movie rating
double userPBar = (double) userCounts[userId - 1] / numDataPoints;
double mrPBar = (double) movieRatingCounts[movieId - 1][movieRating - 1] / numDataPoints;
double likelihood = userPBar * mrPBar * exp(SCALING_FACTOR * -getDistanceSquared(userVec, mrVec, dimensions)) / z;
likelihoodAvg += likelihood;
}
likelihoodAvg /= AVERAGE_SAMPLE_SIZE;
cout << "Likelihood: " << likelihoodAvg << endl;
}
}
//Find the RMSE after each iteration to see if it is improving
double rmse = calculateRMSE(
validationIndices,
validationSize,
data,
userVectors,
movieRatingVectors,
movieRatingCounts,
dimensions);
cout << "Model_RMSE: " << rmse << endl;
}
//Clear out all of the dynamic arrays before stopping
for (int i1 = 0; i1 < numDataPoints; i1++) {
delete[] data[i1];
}
delete[] data;
for (int i1 = 0; i1 < maxUserId; i1++) {
delete[] userVectors[i1];
}
delete[] userVectors;
delete[] userCounts;
delete[] userCumulativeCounts;
for (int i1 = 0; i1 < maxMovieId; i1++) {
for (int i2 = 0; i2 < MAX_STARS; i2++) {
delete[] movieRatingVectors[i1][i2];
}
delete[] movieRatingVectors[i1];
delete[] movieRatingCounts[i1];
delete[] movieRatingCumulativeCounts[i1];
}
delete[] movieRatingVectors;
delete[] movieRatingCounts;
delete[] movieRatingCumulativeCounts;
delete[] trainIndices;
delete[] validationIndices;
delete[] testIndices;
delete[] zValues;
return 0;
}
/**
* Read in the settings for the run from a given input file
* The input file contains, on each line in this order:
* number of dimensions (integer)
* initial value of eta (double)
* user phi value (integer)
* movie rating phi value (integer)
* number of iterations to run for (integer)
* sample size for calculating z (integer)
* sample size for calculating RMSE score (integer)
* @param file The file to read the settings from
* @return Settings struct containing all settings values
*/
struct Settings readSettings(char* file) {
ifstream settingsInput(file, ios::in);
//Initialize variables to hold all the settings
int dimensions;
double eta; //The initial value of eta to use in calculating eta
int phiUser;
int phiMR;
int iterations;
int repelNum;
settingsInput >> dimensions;
settingsInput >> eta;
settingsInput >> phiUser;
settingsInput >> phiMR;
settingsInput >> iterations;
settingsInput >> repelNum;
settingsInput.close();
struct Settings settings;
settings.dimensions = dimensions;
settings.eta = eta;
settings.phiUser = phiUser;
settings.phiMR = phiMR;
settings.iterations = iterations;
settings.repelNum = repelNum;
return settings;
}
/**
* Reads in all of the data points from the given file. The file must be binary
* and each data point is stored sequentially (user id, movie id, movie rating)
* @param file The binary file to read the data from
* @param numDataPoints The number of data points to read from the file
* @return The array of data points, a vector of indices used for shuffling,
* and the maximum user and movie ids for vector generation, in a struct.
*/
struct Data readData(char* file, int numDataPoints) {
ifstream dataFile(file, ios::in | ios::binary);
//Initialize array to hold all data points
int **data = new int*[numDataPoints];
//Init array to hold Indices of all data points
//Just holds the numbers 0, 1, ... 100480506
//This is used to shuffle to be able to go through the data in a random order
int* dataIndices = new int[numDataPoints];
for (int i1 = 0; i1 < numDataPoints; i1++) {
data[i1] = new int[NUM_PARTS];
dataIndices[i1] = i1;
//dataIndices.push_back(i1);
}
int maxUserId = 0;
int maxMovieId = 0;
//Go through and read in all the data
for (int triple = 0; triple < numDataPoints; triple++) {
for (int part = 0; part < NUM_PARTS; part++) {
int in;
dataFile.read((char *)&in, sizeof(int));
data[triple][part] = in;
}
int userId = data[triple][USER_ID_IDX];
int movieId = data[triple][MOVIE_ID_IDX];
//Find max user and movie ids
if (userId > maxUserId) {
maxUserId = userId;
}
if (movieId > maxMovieId) {
maxMovieId = movieId;
}
}
dataFile.close();
//Put everything in the struct and return it
struct Data dataStruct;
dataStruct.data = data;
dataStruct.maxUserId = maxUserId;
dataStruct.maxMovieId = maxMovieId;
dataStruct.dataIndices = dataIndices;
return dataStruct;
}
/**
* Generates vectors for each user id and movie id in the dataset. Each user
* gets a vector and each movie gets five; one for each rating it can receive.
* @param data The array of data points to generate for
* @param numDataPoints The number of data points in the array
* @param maxUserId The maximum user id, used for initializing the user vector
* array
* @param maxMovieId The maximum movie-rating id, used for initializing the
* movie-rating vector array
* @param dimensions The number of dimensions each vector should have
* @return The arrays of vectors for users and movie-ratings, the counts of
* each user and movie-rating for calculating empirical probabilities, and the
* number of distinct users and movies in the dataset, in a struct.
*/
struct Vectors generateVectors(
int** data,
int numDataPoints,
int maxUserId,
int maxMovieId,
int dimensions) {
//Initialize random number generators
mt19937 random(time(0));
normal_distribution<float> randomNormal;
//Init array to hold user vectors and to hold user counts
double **userVectors = new double*[maxUserId];
int *userCounts = new int[maxUserId]; //To calculate the empirical probability
int *userCumulativeCounts = new int[maxUserId]; //To calculate eta
//Init each element of arrays
for (int i1 = 0; i1 < maxUserId; i1++) {
userVectors[i1] = NULL;
userCounts[i1] = 0;
userCumulativeCounts[i1] = 0;
}
//Init array to hold movie rating vectors
double ***movieRatingVectors = new double**[maxMovieId];
int **movieRatingCounts = new int*[maxMovieId]; //To calculate the empirical probability
int **movieRatingCumulativeCounts = new int*[maxMovieId]; //To calculate eta
//Init each element of arrays
for (int i1 = 0; i1 < maxMovieId; i1++) {
movieRatingVectors[i1] = NULL;
movieRatingCounts[i1] = new int[MAX_STARS];
movieRatingCumulativeCounts[i1] = new int[MAX_STARS];
for (int i2 = 0; i2 < MAX_STARS; i2++) {
movieRatingCounts[i1][i2] = 0;
movieRatingCumulativeCounts[i1][i2] = 0;
}
}
//Init number of users and movies
int numUsers = 0;
int numMovies = 0;
//Go through the data and generate the vectors
for (int i1 = 0; i1 < numDataPoints; i1++) {
int *dataPt = data[i1];
int userId = dataPt[USER_ID_IDX];
int movieId = dataPt[MOVIE_ID_IDX];
int movieRating = dataPt[MOVIE_RATING_IDX];
userCounts[userId - 1]++;
movieRatingCounts[movieId - 1][movieRating - 1]++;
//Only generate a new vector if this user doesn't have one yet
if (userVectors[userId - 1] == NULL) {
numUsers++; //Increment number of users
userVectors[userId - 1] = new double[dimensions];
double length = 0;
for (int dimension = 0; dimension < dimensions; dimension++) {
double component = randomNormal(random);
userVectors[userId - 1][dimension] = component;
length += pow(component, 2);
}
length = sqrt(length);
for (int dimension = 0; dimension < dimensions; dimension++) {
userVectors[userId - 1][dimension] /= length;
}
}
//Only generate new vectors of the movie doesn't have vectors yet
if (movieRatingVectors[movieId - 1] == NULL) {
numMovies++;
movieRatingVectors[movieId - 1] = new double*[MAX_STARS];
for (int star = 0; star < MAX_STARS; star++) {
movieRatingVectors[movieId - 1][star] = new double[dimensions];
double length = 0;
for (int dimension = 0; dimension < dimensions; dimension++) {
double component = randomNormal(random);
movieRatingVectors[movieId - 1][star][dimension] = component;
length += pow(component, 2);
}
length = sqrt(length);
for (int dimension = 0; dimension < dimensions; dimension++) {
movieRatingVectors[movieId - 1][star][dimension] /= length;
}
}
}
}
//Stick everything in the struct and return it
struct Vectors vectors;
vectors.userVectors = userVectors;
vectors.userCounts = userCounts;
vectors.userCumulativeCounts = userCumulativeCounts;
vectors.totalUsers = numUsers;
vectors.movieRatingVectors = movieRatingVectors;
vectors.movieRatingCounts = movieRatingCounts;
vectors.movieRatingCumulativeCounts = movieRatingCumulativeCounts;
vectors.totalMovies = numMovies;
return vectors;
}
/**
* Splits the original dataset into three separate, unique datasets.
* @param dataIndices The full vector of all indices of data points
* @return Vectors holding the indices of the data points in the training,
* validation, and test datasets, in a struct.
*/
struct Datasets splitDatasets(int* dataIndices, int numDataPoints) {
//Shuffle the data indices
random_shuffle(&dataIndices[0], &dataIndices[numDataPoints - 1]); //dataIndices.begin(), dataIndices.end());
//Split up the data into training, validation, and test sets
int trainIdxStart = 0;
int trainIdxEnd = TRAIN_SIZE * numDataPoints;
int validationIdxStart = trainIdxEnd + 1;
int validationIdxEnd = validationIdxStart + VALIDATION_SIZE * numDataPoints;
int testIdxStart = validationIdxEnd + 1;
int testIdxEnd = numDataPoints - 1;
int* trainIndices = generateSet(dataIndices, trainIdxStart, trainIdxEnd);
int* validationIndices = generateSet(dataIndices, validationIdxStart, validationIdxEnd);
int* testIndices = generateSet(dataIndices, testIdxStart, testIdxEnd);
struct Datasets datasets;
datasets.trainIndices = trainIndices;
datasets.trainSize = trainIdxEnd - trainIdxStart + 1;
datasets.validationIndices = validationIndices;
datasets.validationSize = validationIdxEnd - validationIdxStart + 1;
datasets.testIndices = testIndices;
datasets.testSize = testIdxEnd - testIdxStart + 1;
return datasets;
}
/**
* Generates a set from the original set of data indices with a given starting
* and ending index.
* @param dataIndices The original set of data indices
* @param startIdx The starting index of the resulting set
* @param endIdx The ending index of the resulting set
* @return The resulting set of indices
*/
int* generateSet(int* dataIndices, int startIdx, int endIdx) {
int* indices = new int[endIdx - startIdx + 1];
int c = 0;
for (int i1 = startIdx; i1 <= endIdx; i1++) {
indices[c] = dataIndices[i1];
c++;
}
return indices;
}
/**
* Samples from the training data and calculates the initial value of z.
* @param trainIndices The array of indices of data points in the training set
* @param data The array of all data points
* @param userVectors The array of all user vectors
* @param movieRatingVectors The array of all movie-rating vectors
* @param random The Mersenne Twister 19937 generator for random numbers
* @param randomDataPoint The uniform int distribution random number generator
* @param sampleSize The number of data points to sample in calculating z
* @param dimensions The dimensionality of the vectors
* @return The initial value of z, as well as each value of z sampled, for
* updating the average later when we remove a data point, in a struct.
*/
struct ZValues calculateInitialZ(
int* trainIndices,
int trainSize,
int** data,
double** userVectors,
double*** movieRatingVectors,
mt19937 random,
uniform_int_distribution<int> randomDataPoint,
int sampleSize,
int dimensions) {
double z = 0;
double *zValues = new double[sampleSize];
//Random shuffle for sample for dist2
//random_shuffle(trainIndices.begin(), trainIndices.end());
random_shuffle(&trainIndices[0], &trainIndices[trainSize - 1]);
//Go through samples and calculate z and dist2
for (int i1 = 0; i1 < sampleSize; i1++) {
int userIdx = trainIndices[randomDataPoint(random)];
int *userSampleDataPt = data[userIdx];
int userId = userSampleDataPt[USER_ID_IDX];
double *userVec = userVectors[userId - 1];