forked from GRONINGEN-MICROBIOME-CENTRE/Lifelines_NEXT
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathStrainphlan_3_NEXT
96 lines (66 loc) · 3.23 KB
/
Strainphlan_3_NEXT
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
# Strainphlan 3.0 analysis for Lifelines NEXT pilot on Gearshift
Adapted from Biobakery (StrainPhlAn 3.0) by Ranko Gascesa, Trishla Sinha (2021)
# Step 1: After creating consensus-marker files which are the input for StrainPhlAn (.pkl) run the following code:
sbatch ./doMarkerComparisonLLNext.sh species name
Example: sbatch ./doMarkerComparisonLLNext.sh s__Bifidobacterium_bifidum
#This will perform MSA and create .tre files and .aln files for each of the species you feed it in.
# MSA was performed on consensus marker presence in at least in 50 samples
#doMarkerComparisonLLNext.sh consists of:
#!/bin/bash
#SBATCH --mem=32gb
#SBATCH --time=0-07:59:00
#SBATCH --cpus-per-task=8
#SBATCH --open-mode=truncate
# NOTES:
# > $1 is clade name
module purge
ml Anaconda3/5.3.0
# load conda env
source activate /groups/umcg-dag3/tmp01/rgacesa_tools/conda/envs/dag3pipe_v3_conda
mkdir ${1}
strainphlan -s /groups/umcg-dag3/tmp01/NEXT_pilot_results/strainphlan3/*.pkl --output_dir ./${1} --clade ${1} --marker_in_n_samples 50 --sample_with_n_markers 20 --nprocs 8
doMarkerComparisonLLNext.sh (END)
# Step 2: Make distance matrix from MSA file
bash./makeDistMat.sh ./species_directory/species.aln
#Example:
#bash ./makeDistMat.sh ./s__Alistipes_shahii/s__Alistipes_shahii.StrainPhlAn3_concatenated.aln
#!/bin/bash
echo 'maker of distance matrix from multiple alignment'
echo ' feed it with .aln file (multiple alignment)'
echo 'NOTE: make sure conda is loaded'
#Create a distance matrix from a multiple sequence alignment using the EMBOSS package (https://www.bioinformatics.nl/cgi-bin/emboss/help/distmat)
# distmat calculates the evolutionary distance between every pair of sequences in a multiple sequence alignment.
# Uses Kimura Two-Parameter distance (distances expressed in terms of the number of substitutions per 100 b.p or amino acids)
distmat -sequence ${1} -nucmethod 2 -outfile ${1/.aln/.dmat}
# Step 3: Cleaning the distance matrix from MSA file
ml RPlus
#Example: Rscript parseDMat_LLNext.R s__Bifidobacterium_bifidum.dmat
#parseDMat_LLNext.R consists of:
library(optparse)
# CL PARSER
help_description <- ""
args <- OptionParser(usage = "%prog clade.distmat.txt metadata.txt ordination.png",
add_help_option = TRUE, prog='strainphlan_ordination.R',
description=help_description )
args_list <- parse_args(args, positional_arguments=TRUE)
# read in the file, skipping the first 8 rows and filling in empty columns, using the tab as sep, and stripping extra white space
inFile <- args_list$args[1]
data <- read.table( inFile, skip = 8, fill = TRUE, sep="\t", strip.white = T)
# remove the first column of the data as it is blank
data[1] <- NULL
# get the header as the last column of the data as a character vector
header <- lapply(data[,ncol(data)], as.character)
# remove the last column from the data as it has been stored as a header
data[ncol(data)] <- NULL
# remove the current last column from the data as it is blank
data[ncol(data)] <- NULL
# split header by space and digit to get the sample names
samples <- unlist(header)
# fix metaphlan just by taking only first 3 thingies after _ split
ss <- c()
sNR <- 0
for (s in samples) {
sNR <- sNR + 1
ssplit = strsplit(s,' ')
ss <- c(ss,ssplit[[1]][1])
}