vsnp3 version 3.24

USDA-VS · Aug 22, 2024 · 69745d0 · 69745d0
1 parent f461495
commit 69745d0
Show file tree

Hide file tree

Showing 27 changed files with 86 additions and 43 deletions.
diff --git a/README.md b/README.md
@@ -13,7 +13,7 @@ Tested with Python 3.8 - 3.9.
 Anaconda [setup](./docs/instructions/conda_instructions.md)
 
 ```
-conda create -c conda-forge -c bioconda -n vsnp3 vsnp3=3.23
+conda create -c conda-forge -c bioconda -n vsnp3 vsnp3=3.24
 ```
 
 ## Installation test

diff --git a/bin/vsnp3_alignment_vcf.py b/bin/vsnp3_alignment_vcf.py
@@ -1,6 +1,6 @@
 #!/usr/bin/env python
 
-__version__ = "3.23"
+__version__ = "3.24"
 
 import os
 import subprocess
@@ -336,7 +336,7 @@ def latex(self, tex, groups=None):
         print(r'\end{adjustbox}', file=tex)
         print(r'\begin{adjustbox}{width=1\textwidth}', file=tex)
 
-        print(r'\begin{tabular}{ l | l | l | l | l | l }', file=tex)
+        print(r'\begin{tabular}{ l | l | l | l | l | l | l }', file=tex)
         print(r'Mapped Paired Reads & Mapped Single Reads & Unmapped Reads & Unmapped Percent & \multicolumn{2}{l}{Unmapped Assembled Contigs} \\', file=tex)
         print(r'\hline', file=tex) 
         mapped_reads = self.READS_PAIRED + self.READS_SINGLE
@@ -346,16 +346,16 @@ def latex(self, tex, groups=None):
         print(r'\hline', file=tex)
         print(r'\hline', file=tex)
 
-        print(r'Duplicate Paired Reads & Duplicate Single Reads & \multicolumn{4}{l}{Duplicate Percent of Mapped Reads} \\', file=tex)
+        print(r'Duplicate Paired Reads & Duplicate Single Reads & \multicolumn{5}{l}{Duplicate Percent of Mapped Reads} \\', file=tex)
         print(r'\hline', file=tex)
-        print(f'{self.DUPLICATE_PAIR:,} & {self.DUPLICATE_SINGLE:,} & ' + r'\multicolumn{4}{l}{' + f'{(self.DUPLICATION_RATIO*100):,.1f}' + r'\%} \\', file=tex)
+        print(f'{self.DUPLICATE_PAIR:,} & {self.DUPLICATE_SINGLE:,} & ' + r'\multicolumn{5}{l}{' + f'{(self.DUPLICATION_RATIO*100):,.1f}' + r'\%} \\', file=tex)
         print(r'\hline', file=tex)
         print(r'\hline', file=tex)
 
-        print(f'BAM File & Reference Length & Genome with Coverage & Average Depth & No Coverage Bases & Quality SNPs \\\\', file=tex)
+        print(f'BAM File & Reference Length & Genome with Coverage & Average Depth & No Coverage Bases & Ambiguous SNPs & Quality SNPs \\\\', file=tex)
         print(r'\hline', file=tex)
         bam = self.zero_coverage.bam.replace('_', '\_')
-        print(f'{bam} & {self.zero_coverage.reference_length:,} & {(self.zero_coverage.genome_coverage*100):,.2f}\% & {self.zero_coverage.ave_coverage:,.1f}X & {self.zero_coverage.total_zero_coverage:,} & {self.zero_coverage.good_snp_count:,} \\\\', file=tex)
+        print(f'{bam} & {self.zero_coverage.reference_length:,} & {(self.zero_coverage.genome_coverage*100):,.2f}\% & {self.zero_coverage.ave_coverage:,.1f}X & {self.zero_coverage.total_zero_coverage:,} & {self.zero_coverage.ac1_count:,} & {self.zero_coverage.good_snp_count:,} \\\\', file=tex)
         print(r'\hline', file=tex)
 
         if groups:
@@ -386,6 +386,7 @@ def excel(self, excel_dict):
         excel_dict['Average Depth'] = f'{self.zero_coverage.ave_coverage:,.1f}X'
         excel_dict['No Coverage Bases'] = f'{self.zero_coverage.total_zero_coverage:,}'
         excel_dict['Percent Ref with Zero Coverage'] = f'{self.zero_coverage.percent_ref_with_zero_coverage:,.6f}%'
+        excel_dict['Ambiguous SNPs'] = f'{self.zero_coverage.ac1_count:,}'
         excel_dict['Quality SNPs'] = f'{self.zero_coverage.good_snp_count:,}'
 
 

diff --git a/bin/vsnp3_annotation.py b/bin/vsnp3_annotation.py
@@ -1,6 +1,6 @@
 #!/usr/bin/env python
 
-__version__ = "3.23"
+__version__ = "3.24"
 
 import os
 import shutil

diff --git a/bin/vsnp3_assembly.py b/bin/vsnp3_assembly.py
@@ -1,6 +1,6 @@
 #!/usr/bin/env python
 
-__version__ = "3.23"
+__version__ = "3.24"
 
 import os
 import sys

diff --git a/bin/vsnp3_best_reference_sourmash.py b/bin/vsnp3_best_reference_sourmash.py
@@ -1,6 +1,6 @@
 #!/usr/bin/env python
 
-__version__ = "3.23"
+__version__ = "3.24"
 
 import os
 import subprocess

diff --git a/bin/vsnp3_bruc_mlst.py b/bin/vsnp3_bruc_mlst.py
@@ -1,6 +1,6 @@
 #!/usr/bin/env python
 
-__version__ = "3.23"
+__version__ = "3.24"
 
 import os
 import io

diff --git a/bin/vsnp3_download_GCA_fasta_get_metadata.py b/bin/vsnp3_download_GCA_fasta_get_metadata.py
@@ -1,6 +1,6 @@
 #!/usr/bin/env python
 
-__version__ = "3.23"
+__version__ = "3.24"
 
 import os
 import sys

diff --git a/bin/vsnp3_download_fasta_gbk_gff_by_acc.py b/bin/vsnp3_download_fasta_gbk_gff_by_acc.py
@@ -1,6 +1,6 @@
 #!/usr/bin/env python
 
-__version__ = "3.23"
+__version__ = "3.24"
 
 import os
 import argparse

diff --git a/bin/vsnp3_excel_merge_defining_snps.py b/bin/vsnp3_excel_merge_defining_snps.py
@@ -1,6 +1,6 @@
 #!/usr/bin/env python
 
-__version__ = "3.23"
+__version__ = "3.24"
 
 import os
 import re

diff --git a/bin/vsnp3_fasta_to_fastq.py b/bin/vsnp3_fasta_to_fastq.py
@@ -1,6 +1,6 @@
 #!/usr/bin/env python3
 
-__version__ = "3.23"
+__version__ = "3.24"
 
 import gzip
 import os

diff --git a/bin/vsnp3_fasta_to_snps_table.py b/bin/vsnp3_fasta_to_snps_table.py
@@ -1,6 +1,6 @@
 #!/usr/bin/env python
 
-__version__ = "3.23"
+__version__ = "3.24"
 
 import os
 import subprocess
@@ -312,7 +312,8 @@ def excel_formatter(self, df_json, write_to, group=None):
         # sample_path_name = self.sample_path_name
         st = self.st
         table_df = pd.read_json(df_json, orient='split')
-        writer = pd.ExcelWriter(write_to, engine='xlsxwriter')
+        writer = pd.ExcelWriter(write_to, engine='xlsxwriter',)
+        writer.book.use_zip64()
         table_df.to_excel(writer, sheet_name='Sheet1')
         wb = writer.book
         ws = writer.sheets['Sheet1']

diff --git a/bin/vsnp3_fastq_stats_seqkit.py b/bin/vsnp3_fastq_stats_seqkit.py
@@ -1,6 +1,6 @@
 #!/usr/bin/env python
 
-__version__ = "3.23"
+__version__ = "3.24"
 
 import os
 import subprocess

diff --git a/bin/vsnp3_file_setup.py b/bin/vsnp3_file_setup.py
@@ -1,6 +1,6 @@
 #!/usr/bin/env python
 
-__version__ = "3.23"
+__version__ = "3.24"
 
 import os
 import shutil

diff --git a/bin/vsnp3_group_on_defining_snps.py b/bin/vsnp3_group_on_defining_snps.py
@@ -1,6 +1,6 @@
 #!/usr/bin/env python
 
-__version__ = "3.23"
+__version__ = "3.24"
 
 import os
 import sys
@@ -370,8 +370,11 @@ def __init__(self, cwd=None, metadata=None, excel_remove=None, gbk_list=None, de
         # print(f'\n\nTotal Time: {datetime.now() - self.beginTime}\n')
 
         #Add back those that where a group was not found
+        if 'Group Not Found' not in groupings_dict:
+            groupings_dict['Group Not Found'] = {}
+
         for sample in samples_without_group_set:
-            groupings_dict = {**groupings_dict, 'Group Not Found': {sample: None}}
+            groupings_dict['Group Not Found'][sample] = pd.DataFrame()
         self.groupings_dict = groupings_dict # will be passed to html summary
 
     def group_selection(self, abs_pos):

diff --git a/bin/vsnp3_group_reporter.py b/bin/vsnp3_group_reporter.py
@@ -1,6 +1,6 @@
 #!/usr/bin/env python
 
-__version__ = "3.23"
+__version__ = "3.24"
 
 import os
 import io
@@ -100,6 +100,7 @@ def bin_and_html_table(self, filename, found_positions, found_positions_mix):
             defining_snps = self.defining_snps
             inverted_defining_snps = self.inverted_defining_snps
             try:
+                sample_groups_list = []
                 defining_snp = False
                 for abs_position in list(defining_snps.keys() & (found_positions.keys() | found_positions_mix.keys())): #absolute positions in set union of two list
                     group = defining_snps[abs_position]
@@ -112,10 +113,21 @@ def bin_and_html_table(self, filename, found_positions, found_positions_mix):
                         group = inverted_defining_snps[abs_position]
                         sample_groups_list.append(group)
                         defining_snp = True
-                if defining_snp:
-                    sample_groups_list = sorted(sample_groups_list)
-                else:
+
+                if defining_snp is False: # extra step to get the group name when there are mutliple defining snps for a group.
+                    for abs_position in list(defining_snps.keys()):
+                        set_abs_position = set(abs_position.split(", "))
+                        set_found_positions = set(found_positions.keys())
+                        is_subset = set_abs_position.issubset(set_found_positions)
+                        if is_subset:
+                            group = defining_snps[abs_position]
+                            sample_groups_list.append(group)
+
+                if len(sample_groups_list) == 0:
                     sample_groups_list = ['No defining SNPs']
+                else:
+                    sample_groups_list = sorted(sample_groups_list)
+
             except TypeError:
                 message = f'File TypeError'
                 print(f'{message}: {filename}')

diff --git a/bin/vsnp3_html_step2_summary.py b/bin/vsnp3_html_step2_summary.py
@@ -1,6 +1,6 @@
 #!/usr/bin/env python
 
-__version__ = "3.23"
+__version__ = "3.24"
 
 import os
 

diff --git a/bin/vsnp3_kernel_plots.py b/bin/vsnp3_kernel_plots.py
@@ -1,6 +1,6 @@
 #!/usr/bin/env python
 
-__version__ = "3.23"
+__version__ = "3.24"
 
 import os
 import re

diff --git a/bin/vsnp3_path_adder.py b/bin/vsnp3_path_adder.py
@@ -1,6 +1,6 @@
 #!/usr/bin/env python
 
-__version__ = "3.23"
+__version__ = "3.24"
 
 import os
 import glob

diff --git a/bin/vsnp3_reference_options.py b/bin/vsnp3_reference_options.py
@@ -1,6 +1,6 @@
 #!/usr/bin/env python
 
-__version__ = "3.23"
+__version__ = "3.24"
 
 import os
 import sys

diff --git a/bin/vsnp3_remove_from_analysis.py b/bin/vsnp3_remove_from_analysis.py
@@ -1,6 +1,6 @@
 #!/usr/bin/env python
 
-__version__ = "3.23"
+__version__ = "3.24"
 
 import os
 import sys

diff --git a/bin/vsnp3_spoligotype.py b/bin/vsnp3_spoligotype.py
@@ -1,6 +1,6 @@
 #!/usr/bin/env python
 
-__version__ = "3.23"
+__version__ = "3.24"
 
 import os
 import gzip
@@ -163,7 +163,7 @@ def spoligo(self):
         count_summary = pull.compute()
         count_summary = OrderedDict(sorted(count_summary.items()))
         spoligo_binary_dictionary = {}
-        self.call_cut_off = 4
+        self.call_cut_off = 2
         for k, v in count_summary.items():
             if v > self.call_cut_off:
                 spoligo_binary_dictionary.update({k: 1})

diff --git a/bin/vsnp3_step1.py b/bin/vsnp3_step1.py
@@ -1,6 +1,6 @@
 #!/usr/bin/env python
 
-__version__ = "3.23"
+__version__ = "3.24"
 
 import os
 import sys
@@ -132,7 +132,7 @@ def run(self,):
         except AttributeError:
             pass
         self.MYCO = MYCO
-        if MYCO and self.spoligo:
+        if self.spoligo:
             spoligo = Spoligo(SAMPLE_NAME=self.sample_name, FASTQ_R1=self.FASTQ_R1, FASTQ_R2=self.FASTQ_R2, debug=self.debug)
             spoligo.spoligo()
             spoligo.latex(self.latex_report.tex)

diff --git a/bin/vsnp3_step2.py b/bin/vsnp3_step2.py
@@ -1,6 +1,6 @@
 #!/usr/bin/env python
 
-__version__ = "3.23"
+__version__ = "3.24"
 
 import os
 import sys
@@ -218,7 +218,10 @@ def __init__(self, runtime=None, vcf_to_df=None, reference=None, groupings_dict=
             print("<tr>", file=htmlfile)
             print(f"<td>{key}</td>", end='\t', file=htmlfile)
             for group in value:
-                print(f"<td>{group}</td>", end='\t', file=htmlfile)
+                if group == "Group Not Found":
+                    print(f'<td><span style="color: red;">{group}</span></td>', end='\t', file=htmlfile)
+                else:
+                    print(f"<td>{group}</td>", end='\t', file=htmlfile)
             print("</tr>", file=htmlfile)
         print("</table>", file=htmlfile)
 

diff --git a/bin/vsnp3_table_compare.py b/bin/vsnp3_table_compare.py
@@ -1,6 +1,6 @@
 #!/usr/bin/env python
 
-__version__ = "3.23"
+__version__ = "3.24"
 
 import os
 import re

diff --git a/bin/vsnp3_vcf_annotation.py b/bin/vsnp3_vcf_annotation.py
@@ -1,6 +1,6 @@
 #!/usr/bin/env python
 
-__version__ = "3.23"
+__version__ = "3.24"
 
 import os
 import re

diff --git a/bin/vsnp3_vcf_merge_to_fasta.py b/bin/vsnp3_vcf_merge_to_fasta.py
@@ -1,6 +1,6 @@
 #!/usr/bin/env python
 
-__version__ = "3.23"
+__version__ = "3.24"
 
 import os
 import re

diff --git a/bin/vsnp3_zero_coverage.py b/bin/vsnp3_zero_coverage.py
@@ -1,6 +1,6 @@
 #!/usr/bin/env python
 
-__version__ = "3.23"
+__version__ = "3.24"
 
 import os
 import re
@@ -22,6 +22,28 @@ class Zero_Coverage(Setup):
     '''
     def __init__(self, FASTA=None, bam=None, vcf=None, debug=False):
 
+        def count_ac1_positions(vcf_file):
+            ac1_count = 0
+
+            with open(vcf_file, 'r') as f:
+                for line in f:
+                    # Skip header lines
+                    if line.startswith('#'):
+                        continue
+
+                    # Split the line into fields
+                    fields = line.strip().split('\t')
+
+                    # Check if INFO field contains AC=1
+                    info = fields[7]
+                    if 'AC=1' in info.split(';'):
+                        ac1_count += 1
+
+            return ac1_count
+
+        self.ac1_count = count_ac1_positions(vcf)
+        print(f"Number of positions with AC=1: {self.ac1_count:,}")
+
         Setup.__init__(self, FASTA=FASTA, debug=debug)
         self.print_run_time('Zero Coverage')
         self.sample_name = re.sub('[_.].*', '', bam)
@@ -116,11 +138,11 @@ def latex(self, tex):
         print(r'\end{center}', file=tex)
         print(r'\end{adjustbox}', file=tex)
         print(r'\begin{adjustbox}{width=1\textwidth}', file=tex)
-        print(r'\begin{tabular}{ l | l | l | l | l | l | l }', file=tex)
-        print(f'BAM File & Reference Length & Genome with Coverage & Average Depth & No Coverage Bases & Quality SNPs \\\\', file=tex)
+        print(r'\begin{tabular}{ l | l | l | l | l | l | l | l }', file=tex)
+        print(f'BAM File & Reference Length & Genome with Coverage & Average Depth & No Coverage Bases & Ambiguous SNPs & Quality SNPs \\\\', file=tex)
         print(r'\hline', file=tex)
         bam = self.bam.replace('_', '\_')
-        print(f'{bam} & {self.reference_length:,} & {(self.genome_coverage*100):,.2f}\% & {self.ave_coverage:,.1f}X & {self.total_zero_coverage:,} & {self.good_snp_count:,} \\\\', file=tex)
+        print(f'{bam} & {self.reference_length:,} & {(self.genome_coverage*100):,.2f}\% & {self.ave_coverage:,.1f}X & {self.total_zero_coverage:,} & {self.ac1_count:,} & {self.good_snp_count:,} \\\\', file=tex)
         print(r'\hline', file=tex)
         print(r'\end{adjustbox}', file=tex)
         print(r'\vspace{0.1 mm}', file=tex)
@@ -136,6 +158,7 @@ def excel(self, excel_dict):
         excel_dict['Average Depth'] = f'{self.ave_coverage:,.1f}'
         excel_dict['No Coverage Bases'] = f'{self.total_zero_coverage:,}'
         excel_dict['Percent Ref with Zero Coverage'] = f'{self.percent_ref_with_zero_coverage:,.6f}%'
+        excel_dict['Ambiguous SNPs'] = f'{self.ac1_count:,}'
         excel_dict['Quality SNPs'] = f'{self.good_snp_count:,}'