-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathgwas_nt.R
78 lines (70 loc) · 2.84 KB
/
gwas_nt.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
# ivan.trus@gmail.com 07/03/2015
# Initialization
start.time <- Sys.time()
cat("\014")
library("seqinr")
# Reading samples (plus.fas) and controls (minus.fas).
# All sequences should have the same length
# (They should be aligned all together before sorting them to different files.)
cat("Reading sequences\n")
seq.minus <- read.alignment(file="minus.fas", format="fasta")
seq.plus <- read.alignment(file="plus.fas", format="fasta")
# Reading intervals
seq.start <- 1 # Start position
#
seq.length <- nchar(seq.minus$seq[[1]]) # End position
#seq.length <- 500
# Calculating the number of the most frequent nucleotide at each position of
# controls
cat("Calculating moda values in negatives\n")
moda.minus <- rep(0, seq.length)
count.moda.minus <- rep(0, seq.length)
allele <- rep("", seq.minus$nb)
for (k in seq.start:(seq.length + seq.start - 1)) {
for (i in 1:seq.minus$nb) {
allele[i] <- substr(seq.minus$seq[[i]], k, k)
}
j <- count(allele, 1)
# Number values are given for each nucleotide (a=1, c=2, g=3, t=4)
moda.minus[k - seq.start + 1] <- which.max(j)
count.moda.minus[k - seq.start + 1] <- j[[moda.minus[k - seq.start + 1]]]
}
# Calculation of frequency of finding the same nucleotide in positive samples
cat("Calculating moda values in positives\n")
count.moda.plus <- rep(0, seq.length)
allele <- rep("", seq.plus$nb)
for (k in seq.start:(seq.length + seq.start - 1)) {
for (i in 1:seq.plus$nb) {
allele[i] <- substr(seq.plus$seq[[i]], k, k)
}
j <- count(allele, 1)
count.moda.plus[k - seq.start + 1]<-j[moda.minus[k - seq.start + 1]]
}
# Calculation of P level (prob) for each position with Fisher's or Chi test
cat("Calculating P values\n")
prob <- rep(1, seq.length)
for (k in seq.start:(seq.length + seq.start - 1)) {
j <- rbind(c(count.moda.plus[k - seq.start + 1],
seq.plus$nb - count.moda.plus[k - seq.start + 1]),
c(count.moda.minus[k - seq.start + 1],
seq.minus$nb - count.moda.minus[k - seq.start + 1]))
# Chi squared test (chisq.test) can be used instead of Fisher's test (fisher.test)
# if all tested values (seq.minus$nb, seq.plus$nb, count.moda.plus,
# count.moda.minus) >=5
prob[k - seq.start + 1] <- chisq.test(as.table(j))$p.value
}
# Plotting of a Manhattan plot with three reference lines
# (P=0.001, P=0.01, P=0.05)
cat("Plotting chart\n")
plot(-log10(prob),
col = ifelse(prob <= 0.05, "red", "black"),
xlab = paste("Site (", seq.start, ":", seq.length + seq.start - 1, ")",
sep = ""),
lwd = 2)
lines(c(1, seq.length + 1), c(-log10(0.05), -log10(0.05)), lwd=1)
lines(c(1, seq.length + 1), c(-log10(0.01), -log10(0.01)), lwd=2)
lines(c(1, seq.length + 1), c(-log10(0.001), -log10(0.001)), lwd=3)
# Plotting of hystogramm of all registered P-values
#hist(-log10(prob), ylim=c(0, 20), breaks=200, freq=T)
# Cleaning of workaround
print(Sys.time()-start.time)