-
Notifications
You must be signed in to change notification settings - Fork 2
/
Copy pathfastQCS3_pkl.py
158 lines (128 loc) · 6.36 KB
/
fastQCS3_pkl.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
import dash
import dash_core_components as dcc
import dash_html_components as html
import pickle
import subprocess
from py_files import make_plots
from py_files import data_prep_stack_barplots as prep
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
from qiime2.plugins import feature_table
from qiime2 import Artifact
# print statements for user intro to software
print('')
print('WELCOME TO fastQCS3! Before you begin:\n',
'\n',
'1. Please make sure your .fastq.gz files are in a directory.\n',
'2. If your sequences are still multiplexed:',
'make sure your barcodes.fastq.gz file lives in the same directory as your sequences.\n',
'3. Make sure your metadata file is in the metadata directory and,\n',
'4. Make sure you know your metadata file name.\n')
demux_status = input('Are your fastq.gz sequence file(s) demultiplexed? (y/n):')
if demux_status == 'y' or 'n':
pass
else:
raise NameError('Please enter either y or n')
def import_demuxed_data(directory):
"""function to run importing of pre-demultiplexed reads"""
subprocess.run(['bash','-c','bash shell_scripts/auto_import.sh '+directory])
return
def auto_demux(directory, metadata):
"""function to run importing and demultiplexing (demux) of multiplexed reads"""
subprocess.run(['bash','-c','bash shell_scripts/auto_demux.sh '+directory+' '+metadata])
return
def auto_dada2(trimlength, metadata):
"""function to run dada2"""
subprocess.run(['bash','-c','bash shell_scripts/auto_dada2.sh '+trimlength+' '+metadata])
return
def auto_classify_phylo(sample_n_features, metadata):
"""function for classification, phylogenetic analysis, outputs data in appropriate form to work with for plotting"""
subprocess.run(['bash','-c','bash shell_scripts/auto_classify_phylo.sh '+sample_n_features+' '+metadata])
return
# prompt user to input directory path
directory = input('Please enter the name of your directory of .fastq.gz files (note: sequence data must be gzipped):')
# adding error statements
if (' ' in directory):
raise TypeError('Cannot have spaces in directory name')
elif ('.' in directory):
raise TypeError('Please avoid periods in directory name to avoid confusion')
# prompting user to add their metadata file
metadata = input('Please enter your complete (ie. filename.tsv) metadata file name (file must exist in metadata directory):')
# adding error statements
if (' ' in metadata):
raise TypeError('Cannot have spaces in filename')
# calling importing functions based on user input
if demux_status == 'n':
auto_demux(directory, metadata)
elif demux_status == 'y':
import_demuxed_data(directory)
# calling find_dropoff function to print information about sequence quality by position
# so that the user can choose their trimlength logically
make_plots.find_dropoff('data/exported_demux/', 500)
# prompting user to input trim length
trimlength = input('\nPlease choose a sequencing trim length:')
# adding error statements
if trimlength.isdigit():
pass
else:
raise TypeError('trim length input must be a positive integer')
# this second block will run dada2 and the following few commands
print('\n...running dada2...this may take a few minutes...')
auto_dada2(trimlength, metadata)
# calling get_feature_info to get some information on feature counts
# to let the user choose sampling depth
make_plots.get_feature_info('data/features/sample-frequency-detail.csv')
# prompting user to input sampling depth
sample_n_features = input('\nPlease choose a sampling depth:')
# adding error statements
if sample_n_features.isdigit():
pass
else:
raise TypeError('sampling depth input must be a positive integer')
print('\n...building phylogenetic tree and classifying ASVs...this may take a few minutes...')
# calling auto_classify_phylo to run remainder of commands
auto_classify_phylo(sample_n_features, metadata)
print('\n',
'fastQCS3 has finished processing your data! Congrats!')
# prompting user to name their .pkl file
basename = input('\nPlease give your visualization file a name:')
# adding error statements
if (' ' in basename):
raise TypeError('Cannot have spaces in filename')
elif ('.' in basename):
raise TypeError('Please avoid periods in filename to avoid file type confusion')
print('\n...packaging your visualizations...this may take a minute...')
# everything below is for creating the plotting objects
"""read in newly created taxonomy data file to pandas"""
taxonomy = pd.read_csv("data/taxonomy.tsv", sep='\t')
taxonomy[['kingdom', 'phylum', 'class', 'order', 'family', 'genus', 'species']] = taxonomy['Taxon'].str.split(';', expand=True)
taxonomy.set_index('Feature ID', inplace=True)
taxonomy.shape
"""reads in table.qza file from qiime2 into DataFrame"""
unrarefied_table = Artifact.load('outputs/table.qza')
rarefy_result = feature_table.methods.rarefy(table=unrarefied_table, sampling_depth=100)
rarefied_table = rarefy_result.rarefied_table
table = rarefied_table.view(pd.DataFrame)
"""pre process data into dataframes for plotting taxonomy relative abundances in stacked barplots"""
kingdom_df, phylum_df, class_df, order_df, family_df, genus_df, species_df = prep.prepare_data_stacked_barplots(table, taxonomy)
"""create plotly figures"""
king_plot = make_plots.plotly_stacked_barplot(kingdom_df, 'Kingdom Relative Abundances')
phy_plot = make_plots.plotly_stacked_barplot(phylum_df, 'Phylum Relative Abundances')
class_plot = make_plots.plotly_stacked_barplot(class_df, 'Class Relative Abundances')
ord_plot = make_plots.plotly_stacked_barplot(order_df, 'Order Relative Abundances')
fam_plot = make_plots.plotly_stacked_barplot(family_df, 'Family Relative Abundances')
gen_plot = make_plots.plotly_stacked_barplot(genus_df, 'Genus Relative Abundances')
spec_plot = make_plots.plotly_stacked_barplot(species_df, 'Species Relative Abundances')
qual_plot = make_plots.plot_qualities('data/exported_demux/', 500)
# qual_plot = make_plots.plot_qualities(directory, 500)
qual_hist = make_plots.quality_hist()
# Loading all plot files into a pkl file
filename = basename + '.pkl'
with open(filename, 'wb') as f:
pickle.dump([king_plot, phy_plot, class_plot, ord_plot, fam_plot, gen_plot, spec_plot, qual_plot, qual_hist], f)
print('\n',
'Now please run the following command to visualize your data in dash!\n',
'\n',
'python fastQCS3_dashboard.py',
'\n')