-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy path11_class_pat_seg.R
147 lines (124 loc) · 5.19 KB
/
11_class_pat_seg.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
# TODO: need to create a warning when saving phenotype file to avoid overwritting variables
# the script compares the homology curve from a patient against the average homology
# curve for test and for control using the leave-one-out procedure
# As a result, the phenotype file gets a new variable with every patient classified
# for the section used as input
# OUTPUT
# will create additional variable in the phenotype file. The name of the variables
# will use input parameters (for example HER2_17qseg1_sig
# the values are: 1 if the segment was found significant for the patient, 0 otherwise, NA
param <- "B0";
phenotype <- "HER2";
dataSet <- "climent";
subdir <- "sect"
action<- "sect";
arm="17q"
seg="seg2"
outliers <- "no";
###############################
# READ FILES
# Establish the beginning path
#begPath <- "~/Research";
begPath <- "..";
# The source code for the permutations
srcPath <- paste(begPath, "TAaCGH", "functions_sig.R", sep="/");
source(srcPath);
# Read the phenotype data
phenFile <- paste(dataSet, "phen.txt", sep="_");
phenPath <- paste(begPath, "Data", dataSet, phenFile, sep="/");
phenData <- read.table(phenPath, header=TRUE, sep="\t");
print(paste("phenPath",phenPath, sep=" "));
# Read data with homology
dataFile <- paste(param,"_2D_",dataSet,"_",action,"_",arm,"_", seg, ".txt", sep="");
#dataPath <- paste(begPath, "Results", dataSet, action, "2D/Homology",dataFile, sep="/");
dataPath <- paste(begPath, "Results", dataSet, subdir, "2D/Homology",dataFile, sep="/");
print(paste("dataPath homology",dataPath, sep=" "));
# Need to determine the maximum number of columns in the file ahead of time
# because the file is jagged. Not squared
nColData <- max(count.fields(dataPath, sep="\t"));
# Read in parameter data
data <- read.table(dataPath, header=FALSE, sep="\t", fill=TRUE, col.names=1:nColData);
# Fill in the NAs in the jagged array with a parameter appropriate value.
# For B0 and CC that value is 1 because that is the minimum/maximum, respectively.
# For B1 that value is 0.
# For D that value is the end index - start index in chrDict (0-based from cgh_dictionary.py).
if (identical(param,"CC") || identical(param,"B0")) {
fillValue <- 1.0;
data[is.na(data)] <- fillValue;
} else if (identical(param,"B1")) {
fillValue <- 0.0;
data[is.na(data)] <- fillValue;
}
# getting the indices for each phenotype group
phen1indices <- which(phenData[,phenotype] == 1);
phen2indices <- which(phenData[,phenotype] == 0);
phen1num<-length(phen1indices);
phen2num<-length(phen2indices);
print(paste("phenotype", phenotype, sep=" "));
print(paste("There are ", phen1num, " phenotype1", sep=""));
print(paste("There are ", phen2num, " phenotype2", sep=""));
# Initializing variables
phen1indices_noOut<-phen1indices;
phen2indices_noOut<-phen2indices;
phen1num_noOut<-phen1num;
phen2num_noOut<-phen2num;
outlier<-paste("out_",arm, sep="");
# Remove outliers from arm
if (outliers=="yes") {
if ( (outlier %in% colnames(phenData))==TRUE) {
outlierIndx<-which(phenData[,outlier]==1);
phen1indices_noOut<-setdiff(phen1indices, outlierIndx);
phen1num_noOut<-length(phen1indices_noOut);
phen2indices_noOut<-setdiff(phen2indices, outlierIndx);
phen2num_noOut<-length(phen2indices_noOut);
print(paste("After removing outliers in ",outlier,":", sep=""));
print(paste("There are ", phen1num_noOut, " patients for test", sep=""));
print(paste("There are ", phen2num_noOut, " patients for control", sep=""));
}
}
# Separate by phenotype
exp <- data[phen1indices_noOut,];
con <- data[phen2indices_noOut,];
# Initializing the new variable which will indicate if the patient is significant for the segment
phenData$newVar <- NA
sumsExp <- colSums(exp)
AvgExp <- sumsExp / phen1num_noOut
sumsCon <- colSums(con)
AvgCon <- sumsCon / phen2num_noOut
# sum of squares leave-one-out
# first for the patients in the test group
for (i in phen1indices_noOut) {
AvgExpLO <- (sumsExp-data[i,]) / (phen1num_noOut-1)
sumSqVsExp <- sum((AvgExpLO - data[i,])^2)
# phenData$sumSqVsExp[i] <- sum((AvgExpLO - data[i,])^2)
sumSqVsCon <- sum((AvgCon - data[i,])^2)
# phenData$sumSqVsCon[i]<- sum((AvgCon - data[i,])^2)
if (sumSqVsExp > sumSqVsCon) {
# if (phenData$sumSqVsExp[i] > phenData$sumSqVsCon[i]) {
phenData$newVar[i] <- 0
} else {
phenData$newVar[i]<- 1
}
}
# now for the patients in the control group
for (i in phen2indices_noOut) {
sumSqVsExp <- sum((AvgExp - data[i,])^2)
# phenData$sumSqVsExp[i] <- sum((AvgExp - data[i,])^2)
AvgConLO <- (sumsCon -data[i,]) / (phen2num_noOut-1)
sumSqVsCon <- sum((AvgConLO - data[i,])^2)
# phenData$sumSqVsCon[i]<- sum((AvgConLO - data[i,])^2)
if (sumSqVsExp > sumSqVsCon) {
# if (phenData$sumSqVsExp[i] > phenData$sumSqVsCon[i]) {
phenData$newVar[i] <- 0
} else {
phenData$newVar[i]<- 1
}
}
test_stat <- sum(phenData$newVar[phen1indices_noOut])/phen1num_noOut
print(paste( phenotype, "_", arm ,seg, '_sig_perc =', test_stat, sep=""))
ctrl_stat <- sum(phenData$newVar[phen2indices_noOut])/phen2num_noOut
print(paste("no", phenotype, "_", arm ,seg, '_sig_perc =', ctrl_stat, sep=""))
cols <- ncol(phenData)
colnames(phenData)[cols] <- paste(phenotype, "_", arm, seg, "_sig", sep="")
# Update phen file
write.table(phenData, file=phenPath, row.names=F, sep='\t')