Merge pull request #22 from bailey-lab/samplesheet

Update to sample sheet preparation addressing some issues arising fro…
bailey-lab · Feb 21, 2022 · 2306380 · 2306380
2 parents 3c0d3a1 + 776e355
commit 2306380
Show file tree

Hide file tree

Showing 5 changed files with 177 additions and 202 deletions.
diff --git a/MIPTools.def b/MIPTools.def
@@ -240,7 +240,7 @@ From: amd64/ubuntu:20.04
 
     help() {
         echo "Open an interactive Jupyter Notebook. The notebook can be used"
-        echo "for post-wrangler mapping and variant calling." 
+        echo "for post-wrangler mapping and variant calling."
         echo ""
         echo "Usage:"
         echo "  singularity run [options] --app jupyter <container>"\
@@ -288,7 +288,7 @@ From: amd64/ubuntu:20.04
     rsync /opt/resources/*.ipynb /opt/analysis --ignore-existing \
         --ignore-missing-args
 
-    # Inform the user how to access the notebook 
+    # Inform the user how to access the notebook
     echo "Use the following command if you are running this notebook from a"
     echo "remote server. Ignore if using a local computer."
     echo "ssh -f -N -L localhost:${nb_port}:${server_ip}:${nb_port}"\
@@ -350,7 +350,7 @@ From: amd64/ubuntu:20.04
         echo "    -m min_capture_length -p probe_sets -s sample_sets \\"
         echo "    -x stitch_options -k"
     }
-    
+
     # Set defaults
     cluster_script="runMIPWranglerCurrent.sh"
     server_number=1
@@ -446,14 +446,14 @@ From: amd64/ubuntu:20.04
     echo "It is recommended to run this app in a screen (GNU screen)."
     echo "A message indicating the end of download will be printed when done."
     echo "Check nohup.out file in your output directory for the download log."
-    
+
     # cd and run app
     # Use nohup to make command keep running even if get hangup signal
     cd /opt/analysis
     nohup python /opt/bin/BaseSpaceRunDownloader_v2.py \
      -r ${run_id} -a "$(cat /opt/resources/access_token.txt)"
 
-    # Print to CLI 
+    # Print to CLI
     echo "Download finished."
 
 #################################################################
@@ -464,7 +464,7 @@ From: amd64/ubuntu:20.04
     set -eu
 
     help() {
-        echo "Demultiplex data. Generates per-sample fastq files from the raw" 
+        echo "Demultiplex data. Generates per-sample fastq files from the raw"
         echo "sequence data consisting of bcl files."
         echo ""
         echo "Usage:"
@@ -475,71 +475,49 @@ From: amd64/ubuntu:20.04
         echo ""
         echo "App Options:"
         echo "  -h    Print the help page."
-        echo "  -p    Required. The sequencing platform used. Either 'miseq'"
-        echo "        or 'nextseq'."
-        echo "  -s    Required. The list of samples. Contains the samples used"
-        echo "        in the study, the primers used, etc. This file must be "
-        echo "        present in the directory mounted to '/opt/analysis'."
+        echo "  -s    Required. Sample sheet for demultiplexing. "
+        echo "        This file must be present in the directory mounted to "
+        echo "        '/opt/analysis'."
         echo ""
         echo "Examples:"
         echo "  # Set paths"
         echo "  $ resource_dir=/bin/MIPTools/base_resources"
         echo "  $ bcl_dir=/work/usr/downloaded"
-        echo "  $ fastq_dir=/work/usr/fastq"
+        echo "  $ fastq_root_dir=/work/usr/"
         echo ""
         echo "  # Run app"
         echo "  $ singularity run \\"
         echo "    -B \${resource_dir}:/opt/resources \\"
         echo "    -B \${bcl_dir}:/opt/data \\"
-        echo "    -B \${fastq_dir}:/opt/analysis \\"
-        echo "    --app demux <container> -s sample_list.tsv -p nextseq"
+        echo "    -B \${fastq_root_dir}:/opt/analysis \\"
+        echo "    --app demux <container> -s SampleSheet.csv"
     }
 
-    while getopts "hp:s:" opt; do
+    while getopts "hs:" opt; do
         case ${opt} in
             h) help
                exit 1 ;;
-            p) platform=${OPTARG} ;;
             s) sample_list=${OPTARG} ;;
             *) help
                exit 1 ;;
         esac
     done
 
     # Define variables
-    cd /opt/src
-    template_dir="/opt/resources/templates/sample_sheet_templates/"
-    platform_template="${platform}"_sample_sheet_template.csv
-    template="${template_dir}${platform_template}"
-    bc_dict="/opt/resources/barcode_dict.json"
-    output_dir="/opt/analysis"
-    sample_list="/opt/analysis/${sample_list}"
-
-    # Create a sample sheet for demultiplexing
-    python -c 'import mip_functions as mip; mip.generate_sample_sheet(
-        "'"${sample_list}"'", 
-        "'"${bc_dict}"'", 
-        "'"${template}"'", 
-        "'"${platform}"'", 
-        "'"${output_dir}"'"
-    )'
-
+    sample_sheet="/opt/analysis/${sample_list}"
     # cd to where bcl files are
     cd /opt/data
-    
+
     # Create a fastq directory for saving fastqs
     mkdir -p /opt/analysis/fastq
-
-    # Copy sample list to fastq directory
-    scp ${sample_list} /opt/analysis/fastq/
-
+
     # Increase limit of open number of files.
     ulimit -Sn $(ulimit -Hn)
-    
+
     # Run bcl2fastq
     # Use nohup to make command keep running even if get hangup signal
     nohup bcl2fastq -o /opt/analysis/fastq \
-        --sample-sheet /opt/analysis/SampleSheet.csv \
+        --sample-sheet ${sample_sheet} \
         --no-lane-splitting
 
 ##################################################################

diff --git a/base_resources/barcode_dict.json b/base_resources/sample_prep/barcode_dict.pickle
diff --git a/src/demux_qc.py b/src/demux_qc.py
@@ -7,7 +7,7 @@
 
 def main(platform, stats_dir):
     """Generate demultiplexing statistics after a sequencing run."""
-    bc_dict = "/opt/resources/barcode_dict.json"
+    bc_dict = "/opt/resources/sample_prep/barcode_dict.pickle"
 
     # load barcode dict to be passed to the header-primer conversion function
     with open(bc_dict, "rb") as infile:

diff --git a/src/mip_functions.py b/src/mip_functions.py
@@ -9695,105 +9695,6 @@ def save_fasta_dict(fasta_dict, fasta_file, linewidth=60):
                 outfile.write(fasta_seq[i: i + linewidth] + "\n")
 
 
-def generate_sample_sheet(sample_list_file,
-                          barcode_dict_file,
-                          sample_sheet_template,
-                          platform,
-                          output_dir,
-                          warnings=False):
-    """Create a sample sheet to be used by bcl2fasq file from sample list."""
-    with open(barcode_dict_file, "rb") as in1:
-        barcode_dic = pickle.load(in1)
-    # read in sample information
-    sample_names = []
-    sample_info = {}
-    with open(sample_list_file) as infile:
-        linenum = 0
-        for line in infile:
-            newline = line.strip().split("\t")
-            # first line is the header with column names
-            if linenum == 0:
-                colnames = newline
-                linenum += 1
-            else:
-                sample_dict = {colname: colvalue for colname, colvalue
-                               in zip(colnames, newline)}
-                sample_set = sample_dict["sample_set"]
-                sample_name = sample_dict["sample_name"]
-                replicate_number = sample_dict["replicate"]
-                forward_index = sample_dict["fw"]
-                reverse_index = sample_dict["rev"]
-                sample_id = "-".join([sample_name,
-                                      sample_set,
-                                      replicate_number])
-                if sample_id in sample_info:
-                    print("Repeating sample name ", sample_id)
-                if not sample_id.replace("-", "").isalnum():
-                    print(("Sample IDs can only contain "
-                           "alphanumeric characters and '-'. "
-                           "{} has invalid characters.").format(sample_id))
-                    continue
-                # nextseq and miseq barcodes are handled differently
-                if platform == "nextseq":
-                    sample_dict.update(
-                        {"i7": barcode_dic[reverse_index]["index_sequence"],
-                         "i5": barcode_dic[forward_index]["index_sequence"]})
-                elif platform == "miseq":
-                    sample_dict.update(
-                        {"i7": barcode_dic[reverse_index]["index_sequence"],
-                         "i5": barcode_dic[forward_index]["sequence"]})
-                sample_dict["sample_index"] = linenum
-                linenum += 1
-                sample_info[sample_id] = sample_dict
-                sample_names.append(sample_id)
-    # Check for samples sharing one or both barcodes. One barcode sharing is
-    # allowed but a warning can be printed if desired by setting the warning
-    #  to True. If both barcodes are shared among two samples, those samples
-    # will be ignored and a message will be broadcast.
-    samples_sharing = []
-    for s1 in sample_info:
-        for s2 in sample_info:
-            if s1 != s2:
-                if ((sample_info[s1]["fw"] == sample_info[s2]["fw"])
-                   and (sample_info[s1]["rev"] == sample_info[s2]["rev"])):
-                    samples_sharing.append([s1, s2])
-                elif warnings and (
-                    (sample_info[s1]["fw"] == sample_info[s2]["fw"])
-                    or (sample_info[s1]["rev"] == sample_info[s2]["rev"])
-                ):
-                    print("Samples %s and %s share a barcode" % (s1, s2))
-    samples_sharing_set = []
-    if len(samples_sharing) > 0:
-        for s in samples_sharing:
-            samples_sharing_set.extend(s)
-        samples_sharing_set = set(samples_sharing_set)
-        print("There are %d samples sharing the same barcode pair"
-              % len(samples_sharing_set))
-        pd.DataFrame(samples_sharing).to_csv(
-            os.path.join(output_dir, "samples_sharing_barcodes.tsv"),
-            sep="\t"
-        )
-    # create sample sheet
-    sample_sheet = os.path.join(output_dir, "SampleSheet.csv")
-    with open(sample_sheet_template) as infile, \
-            open(sample_sheet, "w") as outfile:
-        outfile_list = infile.readlines()
-        outfile_list = [o.strip() for o in outfile_list]
-        for sample_id in sample_names:
-            if sample_id in samples_sharing_set:
-                continue
-            reverse_index = sample_info[sample_id]["rev"]
-            forward_index = sample_info[sample_id]["fw"]
-            sample_index = str(sample_info[sample_id]["sample_index"])
-            outlist = [sample_index, sample_id, "", "",
-                       "S" + reverse_index,
-                       sample_info[sample_id]["i7"],
-                       "N" + forward_index,
-                       sample_info[sample_id]["i5"], "", ""]
-            outfile_list.append(",".join(outlist))
-        outfile.write("\n".join(outfile_list))
-
-
 def chromosome_converter(chrom, from_malariagen):
     """ Convert plasmodium chromosome names from standard (chr1, etc) to
     malariagen names (Pf3d7...) and vice versa.