Skip to content

Commit

Permalink
version 3.21
Browse files Browse the repository at this point in the history
  • Loading branch information
stuber committed May 9, 2024
1 parent b27b389 commit e6ec92e
Show file tree
Hide file tree
Showing 26 changed files with 117 additions and 65 deletions.
2 changes: 1 addition & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@ Tested with Python 3.8 - 3.9.
Anaconda [setup](./docs/instructions/conda_instructions.md)

```
conda create -c conda-forge -c bioconda -n vsnp3 vsnp3=3.20
conda create -c conda-forge -c bioconda -n vsnp3 vsnp3=3.21
```

## Installation test
Expand Down
12 changes: 10 additions & 2 deletions bin/vsnp3_alignment_vcf.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
#!/usr/bin/env python

__version__ = "3.20"
__version__ = "3.21"

import os
import subprocess
Expand Down Expand Up @@ -145,7 +145,15 @@ def qual_value_update(vcf):
return(qp100)

filtered_hapall = f'{sample_name}_filtered_hapall_nanopore.vcf'
bcftools_mpileup = f'bcftools mpileup --threads 16 -Ou -f {reference} {nodup_bamfile} | bcftools call --threads 16 -mv -v -Ov -o {unfiltered_hapall}'
if os.path.exists(os.path.realpath('/project/scratch/singularity')):
print('Downloading bcftools.sif')
os.system(
'singularity pull --name /project/scratch/singularity/bcftools.sif docker://biocontainers/bcftools:v1.9-1-deb_cv1')
else:
print('Will attempt to run bcftools from conda environment')
bcftools_path = os.path.realpath('/project/scratch/singularity/bcftools.sif')
bcftools_mpileup = f'singularity exec {bcftools_path} bcftools mpileup --threads 16 -Ou -f {reference} {nodup_bamfile} | singularity exec {bcftools_path} bcftools call --threads 16 -mv -v -Ov -o {unfiltered_hapall}'
# bcftools_mpileup = f'bcftools mpileup --threads 16 -Ou -f {reference} {nodup_bamfile} | bcftools call --threads 16 -mv -v -Ov -o {unfiltered_hapall}'
vcffilter = f'vcffilter -f "QUAL > 20" {unfiltered_hapall} > temp1.vcf'
os.system(bcftools_mpileup)
os.system(vcffilter)
Expand Down
2 changes: 1 addition & 1 deletion bin/vsnp3_annotation.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
#!/usr/bin/env python

__version__ = "3.20"
__version__ = "3.21"

import os
import shutil
Expand Down
2 changes: 1 addition & 1 deletion bin/vsnp3_assembly.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
#!/usr/bin/env python

__version__ = "3.20"
__version__ = "3.21"

import os
import sys
Expand Down
2 changes: 1 addition & 1 deletion bin/vsnp3_best_reference_sourmash.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
#!/usr/bin/env python

__version__ = "3.20"
__version__ = "3.21"

import os
import subprocess
Expand Down
2 changes: 1 addition & 1 deletion bin/vsnp3_bruc_mlst.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
#!/usr/bin/env python

__version__ = "3.20"
__version__ = "3.21"

import os
import io
Expand Down
2 changes: 1 addition & 1 deletion bin/vsnp3_download_GCA_fasta_get_metadata.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
#!/usr/bin/env python

__version__ = "0.0.1"
__version__ = "3.21"

import os
import sys
Expand Down
2 changes: 1 addition & 1 deletion bin/vsnp3_download_fasta_gbk_gff_by_acc.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
#!/usr/bin/env python

__version__ = "3.20"
__version__ = "3.21"

import os
import argparse
Expand Down
2 changes: 1 addition & 1 deletion bin/vsnp3_excel_merge_defining_snps.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
#!/usr/bin/env python

__version__ = "0.0.1"
__version__ = "3.21"

import os
import re
Expand Down
2 changes: 1 addition & 1 deletion bin/vsnp3_fasta_to_snps_table.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
#!/usr/bin/env python

__version__ = "3.20"
__version__ = "3.21"

import os
import subprocess
Expand Down
2 changes: 1 addition & 1 deletion bin/vsnp3_fastq_stats_seqkit.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
#!/usr/bin/env python

__version__ = "3.20"
__version__ = "3.21"

import os
import subprocess
Expand Down
2 changes: 1 addition & 1 deletion bin/vsnp3_file_setup.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
#!/usr/bin/env python

__version__ = "3.20"
__version__ = "3.21"

import os
import shutil
Expand Down
10 changes: 6 additions & 4 deletions bin/vsnp3_group_on_defining_snps.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
#!/usr/bin/env python

__version__ = "3.20"
__version__ = "3.21"

import os
import sys
Expand Down Expand Up @@ -377,11 +377,13 @@ def __init__(self, cwd=None, metadata=None, excel_remove=None, gbk_list=None, de
def group_selection(self, abs_pos):
sample_dict={}
group_found = False
abs_pos = abs_pos.split(", ")
for sample, single_df in self.dataframe_essentials.items():
if any(single_df['abs_pos'] == abs_pos):
if set(abs_pos).issubset(single_df['abs_pos']): # all positions must be in dataframe column
group_found = True
sample_dict[sample] = self.dataframe_essentials[sample]
elif "!" in abs_pos:
elif "!" in ''.join(abs_pos):
abs_pos = ''.join(abs_pos) # if inverted positions are used there should only be one position in the abs_pos list so it is being reverted back.
abs_pos_inverted = re.sub('!', '', abs_pos)
if not any(single_df['abs_pos'] == abs_pos_inverted):
group_found = True
Expand Down Expand Up @@ -452,7 +454,7 @@ def make_groupings(self, group_sample_dict):
if self.debug:
print(f'\n\t#####\n\t##### {e}, Sample: {sample}\n\t#####\n')
#change alt to N if QUAL 50 - 150
sample_df.loc[(sample_df['QUAL'] >= self.n_threshold) & (sample_df['QUAL'] < self.qual_threshold), 'ALT'] = 'N' # this will overwrite ambigious calls
sample_df.loc[(sample_df['QUAL'] >= self.n_threshold) & (sample_df['QUAL'] < self.qual_threshold) & (sample_df['ALT'] != '-'), 'ALT'] = 'N' # this will overwrite ambigious calls
# < 50 will default to REF... change ALT to REF
try:
sample_df.loc[sample_df['REF'].str.len() > 1, 'REF'] = 'N' #if REF call is indel change to N to maintain equal sequence length for all samples
Expand Down
2 changes: 1 addition & 1 deletion bin/vsnp3_group_reporter.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
#!/usr/bin/env python

__version__ = "3.20"
__version__ = "3.21"

import os
import io
Expand Down
2 changes: 1 addition & 1 deletion bin/vsnp3_html_step2_summary.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
#!/usr/bin/env python

__version__ = "3.20"
__version__ = "3.21"

import os

Expand Down
2 changes: 1 addition & 1 deletion bin/vsnp3_kernel_plots.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
#!/usr/bin/env python

__version__ = "0.0.1"
__version__ = "3.21"

import os
import re
Expand Down
2 changes: 1 addition & 1 deletion bin/vsnp3_path_adder.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
#!/usr/bin/env python

__version__ = "3.20"
__version__ = "3.21"

import os
import glob
Expand Down
48 changes: 26 additions & 22 deletions bin/vsnp3_reference_options.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
#!/usr/bin/env python

__version__ = "3.20"
__version__ = "3.21"

import os
import sys
Expand Down Expand Up @@ -36,6 +36,7 @@ def __init__(self, select_ref=None):
self.gbk = None
self.remove = None
self.fasta = None
self.select_ref = select_ref
all_ref_options = []
script_path = os.path.dirname(os.path.realpath(__file__))
self.script_path = script_path
Expand All @@ -51,27 +52,30 @@ def __init__(self, select_ref=None):
all_ref_options = [x for x in all_ref_options if os.path.isdir(x)] #only capture directories
self.all_ref_options = all_ref_options

#if select_ref is a directory name then use files in it
for directory in all_ref_options:
if select_ref == directory.split('/')[-1]: #find reference by directory name
self.select_ref = select_ref
#if the asked for reference is a match grab files
self.path = directory
self.metadata_gather(directory)
continue
else:
reference_header_capture = defaultdict(list) # find reference by fasta headers
# for directory in all_ref_options:
for each_fasta in glob.glob(f'{directory}/*fasta'):
with open(each_fasta, 'r') as each_fasta:
for each_line in each_fasta:
if each_line.startswith(">"):
reference_header_capture[directory].append(each_line.strip())
for directory, header in reference_header_capture.items():
if select_ref in " ".join(header):
self.select_ref = select_ref
self.path = directory
self.metadata_gather(directory)
if os.path.isdir(select_ref): # -t directory name with full path
print("Directory is a path on the file system")
self.metadata_gather(select_ref)
else:
#if select_ref is a directory name then use files in it
for directory in all_ref_options:
if select_ref == directory.split('/')[-1]: #find reference by directory name
#if the asked for reference is a match grab files
self.path = directory
self.metadata_gather(directory)
continue
else: # this portion should probably be removed. It can cause confusion as to why a reference is found and being used.
reference_header_capture = defaultdict(list) # find reference by fasta headers
# for directory in all_ref_options:
for each_fasta in glob.glob(f'{directory}/*fasta'):
with open(each_fasta, 'r') as each_fasta:
for each_line in each_fasta:
if each_line.startswith(">"):
reference_header_capture[directory].append(each_line.strip())
for directory, header in reference_header_capture.items():
if select_ref in " ".join(header):
self.select_ref = select_ref
self.path = directory
self.metadata_gather(directory)

def metadata_gather(self, directory):
defining_snps = glob.glob(f'{directory}/*xlsx')
Expand Down
2 changes: 1 addition & 1 deletion bin/vsnp3_remove_from_analysis.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
#!/usr/bin/env python

__version__ = "3.20"
__version__ = "3.21"

import os
import sys
Expand Down
2 changes: 1 addition & 1 deletion bin/vsnp3_spoligotype.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
#!/usr/bin/env python

__version__ = "3.20"
__version__ = "3.21"

import os
import gzip
Expand Down
26 changes: 23 additions & 3 deletions bin/vsnp3_step1.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
#!/usr/bin/env python

__version__ = "3.20"
__version__ = "3.21"

import os
import sys
Expand Down Expand Up @@ -42,7 +42,15 @@ def __init__(self, SAMPLE_NAME=None, FASTA=None, FASTQ_R1=None, FASTQ_R2=None, g
self.reference_type = None
with open(self.FASTA) as f:
self.top_header_description = f.readline()
elif reference_type: #IF -n OPTION USE IT, ie directory name
# elif os.path.isdir(reference_type): # -t directory name with full path
# reference_options = Ref_Options(reference_type)
# concatenated_FASTA = self.concat_fasta(reference_options.fasta)
# self.excel_stats.excel_dict["Reference"] = f'{reference_type} Forced'
# Setup.__init__(self, SAMPLE_NAME=SAMPLE_NAME, FASTA=concatenated_FASTA, FASTQ_R1=FASTQ_R1, FASTQ_R2=FASTQ_R2, gbk=reference_options.gbk, debug=debug)
# self.reference_type = reference_type
# with open(self.FASTA) as f:
# self.top_header_description = f.readline()
elif reference_type: # -t directory name
reference_options = Ref_Options(reference_type)
concatenated_FASTA = self.concat_fasta(reference_options.fasta)
self.excel_stats.excel_dict["Reference"] = f'{reference_type} Forced'
Expand Down Expand Up @@ -252,7 +260,8 @@ def run(self,):
parser.add_argument('-f', '--FASTA', nargs='*', dest='FASTA', required=False, help='FASTA file to be used as reference. Multiple can be specified with wildcard')
parser.add_argument('-b', '--gbk', nargs='*', dest='gbk', required=False, default=None, help='Optional: gbk to annotate VCF file. Multiple can be specified with wildcard')
parser.add_argument('-t', '--reference_type', action='store', dest='reference_type', required=False, default=None, help="Optional: Provide directory name with FASTA and GBK file/s")
parser.add_argument('-o', '--nanopore', action='store_true', dest='nanopore', default=False, help='if true run alignment optimized for nanopore reads')
parser.add_argument('-p', '--nanopore', action='store_true', dest='nanopore', default=False, help='if true run alignment optimized for nanopore reads')
parser.add_argument('-o', '--output_dir', action='store', dest='output_dir', required=False, default=None, help="Optional: Provide a name. This name will be a directory output files are writen to. Name can be a directory path, but doesn't have to be.")
parser.add_argument('-assemble_unmap', '--assemble_unmap', action='store_true', dest='assemble_unmap', help='Optional: skip assembly of unmapped reads. See also vsnp3_assembly.py')
parser.add_argument('-spoligo', '--spoligo', action='store_true', dest='spoligo', help='Optional: get spoligotype if TB complex. See also vsnp3_spoligotype.py')
parser.add_argument('-d', '--debug', action='store_true', dest='debug', help='keep spades output directory')
Expand All @@ -269,6 +278,17 @@ def run(self,):
if hasattr(module, '__version__') and name in program_list:
python_programs.append(f'{name}, {module.__version__}')

if args.output_dir:
output_dir = args.output_dir
output_dir = os.path.expanduser(output_dir)
output_dir = os.path.abspath(output_dir)
os.makedirs(output_dir, exist_ok=True)
if args.FASTQ_R1:
shutil.copy(args.FASTQ_R1, output_dir)
if args.FASTQ_R2:
shutil.copy(args.FASTQ_R2, output_dir)
os.chdir(output_dir)

vsnp = vSNP3_Step1(SAMPLE_NAME=args.SAMPLE_NAME, FASTQ_R1=args.FASTQ_R1, FASTQ_R2=args.FASTQ_R2, FASTA=args.FASTA, gbk=args.gbk, reference_type=args.reference_type, nanopore=args.nanopore, assemble_unmap=args.assemble_unmap, spoligo=args.spoligo, debug=args.debug)
vsnp.run()

Expand Down
44 changes: 31 additions & 13 deletions bin/vsnp3_step2.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
#!/usr/bin/env python

__version__ = "3.20"
__version__ = "3.21"

import os
import sys
Expand Down Expand Up @@ -294,7 +294,8 @@ def __init__(self, runtime=None, vcf_to_df=None, reference=None, groupings_dict=
'''), epilog='''---------------------------------------------------------''')

parser.add_argument('-wd', '--wd', action='store', dest='wd', required=False, default='.', help='Optional: path to VCF files. By default .vcf in current working directory are used')
parser.add_argument('-wd', '--wd', action='store', dest='wd', required=False, default='.', help='Optional: path to VCF files. By default .vcf in current working directory are used.')
parser.add_argument('-o', '--output', action='store', dest='output_dir', required=False, default=None, help="Optional: Provide a name. This name will be a directory output files are writen to. Name can be a directory path, but doesn't have to be. By default VCF files are worked on in your current working directory")
parser.add_argument('-t', '--reference_type', action='store', dest='reference_type', default=None, required=False, help='Optional: A valid reference_type name will be automatically found, but a valid reference_type name can be supplied. See vsnp3_path_adder.py -s')
parser.add_argument('-b', '--gbk', nargs='*', dest='gbk', required=False, default=None, help='Optional: gbk to annotate VCF file. Multiple gbk files can be specified with wildcard')
parser.add_argument('-s', '--defining_snps', action='store', dest='defining_snps', default=None, required=False, help='Optional: Defining SNPs with positions to filter. See template_define_filter.xlsx in vsnp dependency folder. Recommended having this file in reference type folder')
Expand All @@ -321,8 +322,13 @@ def __init__(self, runtime=None, vcf_to_df=None, reference=None, groupings_dict=
if args.wd == '.':
cwd_test = True
vcf_list = glob.glob('*vcf')
wd_vcf_list = vcf_list
else:
vcf_list = glob.glob(f'{args.wd}/*vcf')
#get VCFs from a directory
wd = os.path.expanduser(args.wd)
wd = os.path.abspath(wd)
vcf_list = glob.glob(f'{wd}/*vcf')
wd_vcf_list = vcf_list

def zipit(src, dst):
zf = zipfile.ZipFile("%s.zip" % (dst), "w", zipfile.ZIP_DEFLATED)
Expand All @@ -335,12 +341,25 @@ def zipit(src, dst):
zf.close()
shutil.rmtree(src)

if args.output_dir:
wd_vcf_list = []
output_dir = args.output_dir
output_dir = os.path.expanduser(output_dir)
output_dir = os.path.abspath(output_dir)
os.makedirs(output_dir, exist_ok=True)
for each_vcf in vcf_list:
shutil.copy(each_vcf, output_dir)
wd_vcf_list.append(f'{output_dir}/{os.path.basename(each_vcf)}')
os.chdir(output_dir)
setup.cwd = os.getcwd()
global_working_dir = setup.cwd

starting_files = f'{setup.cwd}/vcf_starting_files'
os.makedirs(starting_files)
for each_vcf in vcf_list:
for each_vcf in wd_vcf_list:
shutil.copy(each_vcf, starting_files)

vcf_to_df = VCF_to_DF(vcf_list=vcf_list, debug=args.debug) #write_out=args.write_out
vcf_to_df = VCF_to_DF(vcf_list=wd_vcf_list, debug=args.debug) #write_out=args.write_out
if args.fix_vcfs:
sys.exit(0)

Expand Down Expand Up @@ -386,14 +405,13 @@ def zipit(src, dst):
group = Group(cwd=global_working_dir, metadata=args.metadata, defining_snps=args.defining_snps, excel_remove=args.remove_by_name, gbk_list=args.gbk, dataframes=vcf_to_df.dataframes, all_vcf=args.all_vcf, find_new_filters=args.find_new_filters, no_filters=args.no_filters, qual_threshold=int(args.qual_threshold), n_threshold=int(args.n_threshold), mq_threshold=int(args.mq_threshold), abs_pos=args.abs_pos, group=args.group, debug=args.debug)
vcf_to_df.vcf_bad_list = vcf_to_df.vcf_bad_list + group.vcf_bad_list

#by default the VCF files used are not deleted. They are only deleted when using --remove option AND the files were ran from the current working directory. ie the --wd option was not used.:
if not args.keep_ind_vcfs and cwd_test:
for each_vcf in vcf_list:
try:
os.remove(each_vcf)
except FileNotFoundError:
# if file was previously removed such as it was empty
pass
#rm move vcfs from working directory
for each_vcf in wd_vcf_list:
try:
os.remove(each_vcf)
except FileNotFoundError:
# if file was previously removed such as it was empty
pass

setup.print_time()
HTML_Summary(runtime=setup.run_time, vcf_to_df=vcf_to_df, reference=ro.select_ref, groupings_dict=group.groupings_dict, raxml_version=group.raxml_version, all_vcf_boolen=args.all_vcf, args=args, removed_samples=remove_list)
Expand Down
Loading

0 comments on commit e6ec92e

Please sign in to comment.