-
Notifications
You must be signed in to change notification settings - Fork 5
/
buildThreeClassTrainingSet.py
32 lines (30 loc) · 1.07 KB
/
buildThreeClassTrainingSet.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
import sys, os
def readAndLabelStats(statFileName, label):
    """Read a statistics file and prepend a class label to every data row.

    The first line of the file is treated as the header and is prefixed with
    a "classLabel" column; every following line is prefixed with ``label``.
    The prefix and the original line are separated by a tab.

    Args:
        statFileName: Path of the statistics file to read.
        label: Class label for the data rows; converted with ``str()``.

    Returns:
        A ``(header, examples)`` tuple: the labelled header line and the list
        of labelled data lines. Every returned line ends with a newline.

    Raises:
        ValueError: If the file is empty and therefore has no header line.
    """
    header = None
    examples = []
    labelPrefix = str(label) + "\t"
    with open(statFileName) as statFile:
        for line in statFile:
            # A final line without a trailing newline would otherwise be
            # glued to whatever the caller writes after it.
            if not line.endswith("\n"):
                line += "\n"
            if header is None:
                header = "classLabel\t" + line
            else:
                examples.append(labelPrefix + line)
    if header is None:
        # Fail with a clear message instead of an UnboundLocalError.
        raise ValueError("%s is empty: no header line found" % statFileName)
    return header, examples
# Command-line driver: combine the three statistics files found in statDir
# into a single class-labelled training set written to trainingSetFileName.
if len(sys.argv) != 3:
    # Report a usage message rather than a tuple-unpacking traceback.
    sys.exit("usage: %s statDir trainingSetFileName" % sys.argv[0])
statDir, trainingSetFileName = sys.argv[1:]

# (file name, class label) pairs, listed in the order their rows are written.
labelledStatFiles = [
    ("mig12.msOut", 1),
    ("mig21.msOut", 2),
    ("noMig.msOut", 0),
]

headers = set()
trainingLines = []
for statFileName, classLabel in labelledStatFiles:
    header, examples = readAndLabelStats(
        "%s/%s" % (statDir, statFileName), classLabel
    )
    headers.add(header)
    trainingLines.extend(examples)

# All input files must share one header; otherwise the columns of the
# combined training set would not line up.
if len(headers) != 1:
    sys.exit("Not all headers are identical. AAAARRRGGGGHHHHHH!!!\n")

with open(trainingSetFileName, "w") as outFile:
    outFile.write(header)
    outFile.writelines(trainingLines)