# 4.1.Standardize.all.random.forest.py
import glob
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
import sys, os
from collections import Counter
import sklearn.metrics
# Leave-one-species-out evaluation of random forest classifiers.
#
# For every forest size in N_ESTIMATORS_GRID, every feature table matched by
# TRAIN_GLOB (minus the last one after sorting) and every species listed in
# the metadata, the samples of that species are held out, a forest is fitted
# on all remaining samples, and one tab-separated line is written:
#     species, accuracy, percentage, F1, dataset name, number of trees
#
# Usage: pass one command-line argument; it becomes part of the output name
# ./4.1.random.forest/R4.1.all.<argument>.out

METADATA_PATH = "./Data/01metadata.Train.csv"
TRAIN_GLOB = "./Data/*train.csv"
SPECIES_COLUMN = "Species"
LABEL_COLUMN = "MicAbundance"
# Forest sizes to evaluate: 10, 20, ..., 100 trees.
N_ESTIMATORS_GRID = range(10, 110, 10)
# Fixed seed: the training rows are shuffled the same way on every run.
SHUFFLE_SEED = 0


def read_indexed_csv(path, **read_csv_kwargs):
    """Load a CSV whose first column is the row index, using string labels.

    Args:
        path: location of the CSV file.
        **read_csv_kwargs: forwarded unchanged to ``pandas.read_csv``.

    Returns:
        The parsed ``DataFrame`` with every index label converted via ``str``,
        so sample names are looked up as strings in every table.
    """
    # The handle is closed as soon as parsing ends (it used to be leaked).
    with open(path) as handle:
        table = pd.read_csv(handle, index_col=0, **read_csv_kwargs)
    table.index = table.index.map(str)
    return table


def split_out_species(metadata, features, species_name):
    """Split the samples into "one species" (test) and "all others" (train).

    Args:
        metadata: frame indexed by sample name holding the ``Species`` and
            ``MicAbundance`` columns.
        features: frame indexed by sample name holding the predictors.
        species_name: value of ``Species`` to hold out.

    Returns:
        ``(X_train, y_train, X_test, y_test)`` as plain numpy arrays.

    Raises:
        KeyError: if a sample name needed from one frame is absent from
            the other.
    """
    held_out = metadata.index[metadata[SPECIES_COLUMN] == species_name]
    X_test = features.loc[held_out, :].values
    y_test = np.asarray(metadata.loc[held_out, LABEL_COLUMN])

    keep = ~features.index.isin(held_out)
    X_train = features[keep].values
    # Labels are fetched by sample name, not by reusing the positional mask,
    # so they stay paired with their feature rows even if the two tables
    # list the samples in a different order.
    y_train = np.asarray(metadata.loc[features.index[keep], LABEL_COLUMN])
    return X_train, y_train, X_test, y_test


def shuffle_rows(X, y, seed=SHUFFLE_SEED):
    """Return *X* and *y* reordered by one shared, seeded permutation."""
    order = np.random.RandomState(seed).permutation(len(X))
    return X[order], y[order]


def score_held_out(clf, X_test, y_test):
    """Score a fitted classifier on the held-out samples of one species.

    The first true label is taken as the positive class; this presumes that
    all samples of a species carry the same label (not checked here).

    Returns:
        ``(accuracy, percentage, f1)``: ``clf.score`` on the held-out set,
        the share of predictions equal to the positive class scaled to
        0-100, and the binary F1 score for that class.
    """
    prediction = clf.predict(X_test)
    positive = y_test[0]
    accuracy = clf.score(X_test, y_test)
    f1 = sklearn.metrics.f1_score(
        y_test, prediction, average="binary", pos_label=positive
    )
    hits = Counter(prediction)[positive]
    # NOTE(review): true division on Python 3, floor division on Python 2 --
    # confirm which one the downstream analysis expects.
    percentage = hits * 100 / len(y_test)
    return accuracy, percentage, f1


def main():
    """Run the whole evaluation grid and write the results file."""
    if len(sys.argv) < 2:
        sys.exit("usage: {} <run-label>".format(sys.argv[0]))
    out_path = "./4.1.random.forest/R4.1.all." + sys.argv[1] + ".out"

    # The last path in sorted order is skipped (original note: "remove OTUS").
    train_paths = sorted(glob.glob(TRAIN_GLOB))[:-1]

    # Inputs do not depend on the forest size, so they are read only once.
    metadata = read_indexed_csv(METADATA_PATH)
    species = sorted(set(metadata[SPECIES_COLUMN]))
    datasets = [
        (
            os.path.basename(path).split(".")[0],
            read_indexed_csv(path, dtype={"user_id": np.int64}, low_memory=False),
        )
        for path in train_paths
    ]

    with open(out_path, "w") as fout:
        for n_trees in N_ESTIMATORS_GRID:
            # "sqrt" is what "auto" meant for classifiers; "auto" itself is
            # rejected by current scikit-learn releases.
            clf = RandomForestClassifier(
                max_depth=None, n_estimators=n_trees, max_features="sqrt"
            )
            for dataset_name, features in datasets:
                for sp in species:
                    X_train, y_train, X_test, y_test = split_out_species(
                        metadata, features, sp
                    )
                    X_train, y_train = shuffle_rows(X_train, y_train)
                    clf.fit(X_train, y_train)
                    accuracy, percentage, f1 = score_held_out(clf, X_test, y_test)
                    fields = (sp, accuracy, percentage, f1, dataset_name, n_trees)
                    fout.write("\t".join(str(field) for field in fields) + "\n")


if __name__ == "__main__":
    main()