# count_duplicates.R
library(data.table)  # fread() for fast CSV import
library(readr)       # write_csv() for the final export

trainData <- fread("case4_train_numeric.csv", header = TRUE, data.table = FALSE)
# Draw a random sample of 500,000 records
set.seed(123)
training_indices <- sample(seq_len(nrow(trainData)), size = 500000)
sample_500k <- trainData[training_indices, ]
# Binarize the feature columns: every observed value (the scaled features fall
# in [-1, 1]) becomes 1 and every missing value becomes 0, so each row turns
# into a presence/absence mask. Id and Response are left untouched.
feat <- sample_500k[, 2:969]
feat[!is.na(feat) & feat >= -1 & feat <= 1] <- 1
feat[is.na(feat)] <- 0
sample_500k[, 2:969] <- feat
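# Toy illustration (hypothetical values, not part of the workflow): the same
# masking turns a numeric row into a presence/absence pattern.
demo_row <- c(0.23, NA, -0.8)
demo_row[!is.na(demo_row) & demo_row >= -1 & demo_row <= 1] <- 1
demo_row[is.na(demo_row)] <- 0
demo_row  # 1 0 1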
# Drop Id and Response, keeping the 968 feature columns
sample_500k_duplicates <- sample_500k[, 2:969]
# Count the duplicates of each unique row on a 100k subset: grouping by every
# column and taking length() per group gives one row per unique pattern plus
# its multiplicity
batch <- sample_500k_duplicates[1:100000, ]
aggregation <- aggregate(list(numdup = rep(1, nrow(batch))), batch, length)
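# Toy illustration (hypothetical data, not part of the workflow) of the
# counting trick used above:
toy <- data.frame(a = c(1, 1, 0), b = c(0, 0, 1))
aggregate(list(numdup = rep(1, nrow(toy))), toy, length)
# -> pattern (a = 1, b = 0) gets numdup = 2, pattern (a = 0, b = 1) gets numdup = 1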
# Check the max, min and mean number of duplicates
max(aggregation$numdup)
min(aggregation$numdup)
mean(aggregation$numdup)
# Select the characteristic 0/1 pattern of the major product group, i.e. the
# row pattern with the most duplicates (take the first row in case of ties)
select_max_row <- aggregation[aggregation$numdup == max(aggregation$numdup), ]
select_max_row_search <- select_max_row[1, ]
select_max_row_search$numdup <- NULL
select_max_row_search <- unlist(select_max_row_search)  # named numeric vector, one value per feature
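# Optional sanity check (an addition, not in the original script): the search
# pattern should hold exactly one value per feature column
stopifnot(length(select_max_row_search) == 968)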
# Extract the matching records, first from the 500k sample, in 25 batches of 20k
batch_size <- 20000
res_duplicates <- data.frame()
for (i in 1:25) {
  batch <- sample_500k[(1 + (i - 1) * batch_size):(i * batch_size), ]
  res <- batch[which(apply(batch, 1, function(x) all(select_max_row_search == x[2:969]))), ]
  res_duplicates <- rbind(res_duplicates, res)
}
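# Alternative sketch (assumption: the binarized frame fits in memory as a
# data.table; dt, pattern_dt and res_dt are hypothetical names): the row-wise
# apply() scan is slow in R, and a keyed join on all 968 feature columns would
# find the same rows in one call. Left commented out, not run above.
# dt <- as.data.table(sample_500k)
# pattern_dt <- as.data.table(as.list(select_max_row_search))
# res_dt <- dt[pattern_dt, on = names(pattern_dt), nomatch = NULL]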
# Repeat for the remaining ~684k records (those outside the 500k sample),
# after applying the same binarization so the pattern comparison stays valid
sample_rest <- trainData[-training_indices, ]
feat <- sample_rest[, 2:969]
feat[!is.na(feat) & feat >= -1 & feat <= 1] <- 1
feat[is.na(feat)] <- 0
sample_rest[, 2:969] <- feat
for (i in 1:34) {
  batch <- sample_rest[(1 + (i - 1) * batch_size):(i * batch_size), ]
  res <- batch[which(apply(batch, 1, function(x) all(select_max_row_search == x[2:969]))), ]
  res_duplicates <- rbind(res_duplicates, res)
}
# Include the remaining 3,747 records (683,747 minus the 34 batches of 20k)
batch <- sample_rest[(1 + 34 * batch_size):nrow(sample_rest), ]
res <- batch[which(apply(batch, 1, function(x) all(select_max_row_search == x[2:969]))), ]
res_duplicates <- rbind(res_duplicates, res)
# Drop columns that contain only missing values
nas <- apply(res_duplicates, 2, function(x) sum(is.na(x)))
data <- res_duplicates[, nas != nrow(res_duplicates)]
# Drop columns with zero variance
data <- data[, apply(data, 2, function(x) var(x) != 0)]
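# Toy illustration (hypothetical data, not part of the workflow) of the
# zero-variance filter above: a constant column is dropped, a varying one kept.
toy_var <- data.frame(a = c(1, 1), b = c(0, 1))
toy_var[, apply(toy_var, 2, function(x) var(x) != 0), drop = FALSE]  # keeps only b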
# Export the extracted product group
write_csv(data, "product_1.csv", col_names = TRUE)