-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy path51_make_r_raw_kmer_trees_rresults.txt
82 lines (49 loc) · 2.39 KB
/
51_make_r_raw_kmer_trees_rresults.txt
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
> ## this script makes k-mer trees directly from DNA database
>
> library(ape)
> library(kmer)
> library(shipunov)
package 'shipunov', version 1.5-1
> DATE <- format(Sys.time(), "%Y%m%d_%H%M%S") # timestamp
> dna <- read.table("_kubricks_dna.txt", h=TRUE, sep="\t", as.is=TRUE) # read database
> ## convert database into FASTA, write it on disc and read again as "DNAbin" object
> raw <- dna
> raw$ids.tmp <- apply(raw[, c("FRAGMENT", "SPECIES.NEW", "SEQUENCE.ID", "SELECT")], 1, function(.x) (paste(.x, collapse="_")))
> Write.fasta(raw[, c("ids.tmp", "SEQUENCE")], file="40_concatenated/raw.fasta")
> raw <- read.dna("40_concatenated/raw.fasta", format="fasta")
> ## now build the k-mer tree from whole data inculding sequences which were de-selected
> cat("raw k-mer...\n")
raw k-mer...
> raw.tree <- nj(kdistance(raw, k=7))
> raw.tree$edge.length[raw.tree$edge.length < 0] <- 0.00001 # this is to increase visibility, found experimentally
> raw.tree$edge.length <- (raw.tree$edge.length + 0.005) * 2 # this is to increase visibility, found experimentally
> ## plot tree into PDF
> pdf(paste0("50_technical_trees/", DATE, "_raw_kmer_nj_kubricks.pdf"), width=12, height=12) # change PDF size if needed
> oldpar <- par(mar=rep(0, 4))
> plot(raw.tree, no.margin=TRUE)
> mtext("k-mer: raw, all", font=2, line=-3)
> par(oldpar)
> dev.off()
null device
1
> ## this is raw data minus de-selected sequences
> ## make FASTA
> rawd <- dna[dna$SELECT == 1, ] # remove deselected
> rawd$ids.tmp <- apply(rawd[, c("FRAGMENT", "SPECIES.NEW", "SEQUENCE.ID", "SELECT")], 1, function(.x) (paste(.x, collapse="_")))
> Write.fasta(rawd[, c("ids.tmp", "SEQUENCE")], file="40_concatenated/rawd.fasta")
> rawd <- read.dna("40_concatenated/rawd.fasta", format="fasta")
> ## build the tree
> cat("rawd k-mer...\n")
rawd k-mer...
> rawd.tree <- nj(kdistance(rawd, k=7))
> rawd.tree$edge.length[rawd.tree$edge.length < 0] <- 0.00001 # this is to increase visibility, found experimentally
> rawd.tree$edge.length <- (rawd.tree$edge.length + 0.005) * 2 # this is to increase visibility, found experimentally
> ## save tree as PDF
> pdf(paste0("50_technical_trees/", DATE, "_rawd_kmer_nj_kubricks.pdf"), width=12, height=12) # change PDF size if needed
> oldpar <- par(mar=rep(0, 4))
> plot(rawd.tree, no.margin=TRUE)
> mtext("k-mer: all minus deselected", font=2, line=-3)
> par(oldpar)
> dev.off()
null device
1