genome · seqfu · Aug 16, 2018 · Sep 21, 2018 · May 17, 2019 · May 17, 2019
diff --git a/Dockerfile b/Dockerfile
@@ -1,5 +1,35 @@
-FROM mgibio/samtools:1.3.1
+FROM ubuntu:focal
+
+# tzdata config from https://stackoverflow.com/a/47909037
+ENV DEBIAN_FRONTEND noninteractive
+ENV DEBCONF_NONINTERACTIVE_SEEN true
+
+RUN apt-get update && \
+    # Required build tools  
+    apt-get install -y build-essential cmake && \
+    # git to fetch the repo and wget to populate vendor
+    apt-get install -y git wget && \
+    # Clean up
+    apt-get clean all && \
+    rm -rf /var/lib/apt/lists/* && rm -rf /tmp/* && rm -rf /var/tmp/*
+
+RUN cd / && \
+    git clone https://github.com/genome/bam-readcount && \
+    cd bam-readcount && \
+    # For a specific tag enable the git checkout below
+    #git checkout cram-v0.0.2 && \
+    rm -rf build && \
+    mkdir build && \
+    cd build && \
+    cmake .. && \
+    make
+
+
+# By adding zlib1g-dev don't appear to need mgibio/samtools:1.3.1 as a base
+FROM ubuntu:focal
 MAINTAINER John Garza <johnegarza@wustl.edu>
+ENV DEBIAN_FRONTEND noninteractive
+ENV DEBCONF_NONINTERACTIVE_SEEN true
 
 LABEL \
     description="Image supporting the bam-readcount program"
@@ -10,25 +40,27 @@ RUN apt-get update && \
 	libcurl4-openssl-dev \
 	libssl-dev \
         patch \
-        python \
-        python-pip \
-        git
+        python-is-python3 \
+        python3 \
+        python3-pip \
+        git \
+        automake \
+        autoconf \
+        libbz2-dev \
+        liblzma-dev \
+        zlib1g-dev
 
 ENV SAMTOOLS_ROOT=/opt/samtools
-RUN mkdir /opt/bam-readcount
+RUN mkdir -p /opt/bam-readcount/bin
 
 WORKDIR /opt/bam-readcount
-RUN git clone https://github.com/genome/bam-readcount.git /tmp/bam-readcount-0.7.4 && \
-    git -C /tmp/bam-readcount-0.7.4 checkout v0.7.4 && \
-    cmake /tmp/bam-readcount-0.7.4 && \
-    make && \
-    rm -rf /tmp/bam-readcount-0.7.4 && \
-    ln -s /opt/bam-readcount/bin/bam-readcount /usr/bin/bam-readcount
+COPY --from=0 /bam-readcount/build/bin/bam-readcount /opt/bam-readcount/bin/bam-readcount
+RUN ln -s /opt/bam-readcount/bin/bam-readcount /usr/bin/bam-readcount
 
 COPY bam_readcount_helper.py /usr/bin/bam_readcount_helper.py
 
-RUN pip install --upgrade pip
-RUN pip install cyvcf2
+RUN pip3 install cyvcf2
+
 
 #clear inherited entrypoint
 ENTRYPOINT []
diff --git a/README.md b/README.md
@@ -0,0 +1,23 @@
+bam_readcount_helper-cwl
+========================
+
+Ubuntu Focal-based Docker image with `bam-readcount` and helper script
+
+
+Resources
+---------
+
+https://github.com/genome/bam-readcount  
+`bam-readcount` and documentation
+
+
+Usage
+-----
+
+`bam-readcount` and the helper script are installed as
+
+    /usr/bin/bam-readcount
+    /usr/bin/bam_readcount_helper.py
+
+
+
diff --git a/bam_readcount_helper.py b/bam_readcount_helper.py
@@ -1,4 +1,4 @@
-#!/usr/bin/env python
+#!/usr/bin/env python3
 
 import sys
 import os
@@ -16,13 +16,13 @@ def generate_region_list(hash):
     fh.close()
     return fh.name
 
-def filter_sites_in_hash(region_list, bam_file, ref_fasta, sample, output_dir, insertion_centric):
-    bam_readcount_cmd = ['/usr/bin/bam-readcount', '-f', ref_fasta, '-l', region_list, '-w', '0', '-b', '20']
+def filter_sites_in_hash(region_list, bam_file, ref_fasta, prefixed_sample, output_dir, insertion_centric, map_qual, base_qual):
+    bam_readcount_cmd = ['/usr/bin/bam-readcount', '-f', ref_fasta, '-l', region_list, '-w', '0', '-b', str(base_qual), '-q', str(map_qual)]
     if insertion_centric:
         bam_readcount_cmd.append('-i')
-        output_file = os.path.join(output_dir, sample + '_bam_readcount_indel.tsv')
+        output_file = os.path.join(output_dir, prefixed_sample + '_bam_readcount_indel.tsv')
     else:
-        output_file = os.path.join(output_dir, sample + '_bam_readcount_snv.tsv')
+        output_file = os.path.join(output_dir, prefixed_sample + '_bam_readcount_snv.tsv')
     bam_readcount_cmd.append(bam_file)
     execution = Popen(bam_readcount_cmd, stdout=PIPE, stderr=PIPE)
     stdout, stderr = execution.communicate()
@@ -32,70 +32,75 @@ def filter_sites_in_hash(region_list, bam_file, ref_fasta, sample, output_dir, i
     else:
         sys.exit(stderr)
 
+#initializing these with default values
+min_base_qual = 20
+min_mapping_qual = 0
 
-(script_name, vcf_filename, sample, ref_fasta, bam_file, output_dir)= sys.argv
+if len(sys.argv) == 7:
+    (script_name, vcf_filename, sample, ref_fasta, bam_file, prefix, output_dir)= sys.argv
+elif len(sys.argv) == 8:
+    (script_name, vcf_filename, sample, ref_fasta, bam_file, prefix, output_dir, min_base_qual)= sys.argv
+elif len(sys.argv) == 9: #elif instead of else for explicit safety
+    (script_name, vcf_filename, sample, ref_fasta, bam_file, prefix, output_dir, min_base_qual, min_mapping_qual)= sys.argv
+
+if prefix == 'NOPREFIX':
+    prefixed_sample = sample
+else:
+    prefixed_sample = '_'.join([prefix, sample])
 
 vcf_file = VCF(vcf_filename)
 sample_index = vcf_file.samples.index(sample)
 
 rc_for_indel = {}
 rc_for_snp   = {}
 for variant in vcf_file:
-    alleles = [variant.REF] + variant.ALT
-    genotype = variant.genotypes[sample_index]
-    gt_alt_alleles = [allele for allele in genotype if allele > 0]
-    used_gt_alt_alleles = [alleles[index] for index in gt_alt_alleles]
-    if len(used_gt_alt_alleles) > 0:
-        var = sorted(used_gt_alt_alleles)[0]
-    else:
-        var = ""
     ref = variant.REF
     chr = variant.CHROM
     start = variant.start
     end = variant.end
     pos = variant.POS
-
-    if len(ref) > 1 or len(var) > 1:
-        #it's an indel or mnp
-        if len(ref) == len(var) or (len(ref) > 1 and len(var) > 1):
-            sys.stderr.write("Complex variant or MNP will be skipped: %s\t%s\t%s\t%s\n" % (chr, pos, ref , var))
-            continue
-        elif len(ref) > len(var):
-            #it's a deletion
-            pos += 1
-            unmodified_ref = ref
-            ref = unmodified_ref[1]
-            var = "-%s" % unmodified_ref[1:]
+    for var in  variant.ALT:
+        if len(ref) > 1 or len(var) > 1:
+            #it's an indel or mnp
+            if len(ref) == len(var) or (len(ref) > 1 and len(var) > 1):
+                sys.stderr.write("Complex variant or MNP will be skipped: %s\t%s\t%s\t%s\n" % (chr, pos, ref , var))
+                continue
+            elif len(ref) > len(var):
+                #it's a deletion
+                pos += 1
+                unmodified_ref = ref
+                ref = unmodified_ref[1]
+                var = "-%s" % unmodified_ref[1:]
+            else:
+                #it's an insertion
+                var = "+%s" % var[1:]
+            if chr not in rc_for_indel:
+                rc_for_indel[chr] = {}
+            if pos not in rc_for_indel[chr]:
+                rc_for_indel[chr][pos] = {}
+            if ref not in rc_for_indel[chr][pos]:
+                rc_for_indel[chr][pos][ref] = {}
+            rc_for_indel[chr][pos][ref] = variant
         else:
-            #it's an insertion
-            var = "+%s" % var[1:]
-        if chr not in rc_for_indel:
-            rc_for_indel[chr] = {}
-        if pos not in rc_for_indel[chr]:
-            rc_for_indel[chr][pos] = {}
-        if ref not in rc_for_indel[chr][pos]:
-            rc_for_indel[chr][pos][ref] = {}
-        rc_for_indel[chr][pos][ref] = variant
-    else:
-        #it's a SNP
-        if chr not in rc_for_snp:
-            rc_for_snp[chr] = {}
-        if pos not in rc_for_snp[chr]:
-            rc_for_snp[chr][pos] = {}
-        if ref not in rc_for_snp[chr][pos]:
-            rc_for_snp[chr][pos][ref] = {}
-        rc_for_snp[chr][pos][ref] = variant
+            #it's a SNP
+            if chr not in rc_for_snp:
+                rc_for_snp[chr] = {}
+            if pos not in rc_for_snp[chr]:
+                rc_for_snp[chr][pos] = {}
+            if ref not in rc_for_snp[chr][pos]:
+                rc_for_snp[chr][pos][ref] = {}
+            rc_for_snp[chr][pos][ref] = variant
 
 if len(rc_for_snp.keys()) > 0:
     region_file = generate_region_list(rc_for_snp)
-    filter_sites_in_hash(region_file, bam_file, ref_fasta, sample, output_dir, False)
+    filter_sites_in_hash(region_file, bam_file, ref_fasta, prefixed_sample, output_dir, False, min_mapping_qual, min_base_qual)
 else:
-    output_file = os.path.join(output_dir, sample + '_bam_readcount_snv.tsv')
+    output_file = os.path.join(output_dir, prefixed_sample + '_bam_readcount_snv.tsv')
     open(output_file, 'w').close()
 
 if len(rc_for_indel.keys()) > 0:
     region_file = generate_region_list(rc_for_indel)
-    filter_sites_in_hash(region_file, bam_file, ref_fasta, sample, output_dir, True)
+    filter_sites_in_hash(region_file, bam_file, ref_fasta, prefixed_sample, output_dir, True, min_mapping_qual, min_base_qual)
 else:
-    output_file = os.path.join(output_dir, sample + '_bam_readcount_indel.tsv')
+    output_file = os.path.join(output_dir, prefixed_sample + '_bam_readcount_indel.tsv')
     open(output_file, 'w').close()