Skip to content

Commit

Permalink
Merge pull request #22 from bailey-lab/samplesheet
Browse files Browse the repository at this point in the history
Update to sample sheet preparation addressing some issues arising fro…
  • Loading branch information
JeffAndBailey authored Feb 21, 2022
2 parents 3c0d3a1 + 776e355 commit 2306380
Show file tree
Hide file tree
Showing 5 changed files with 177 additions and 202 deletions.
58 changes: 18 additions & 40 deletions MIPTools.def
Original file line number Diff line number Diff line change
Expand Up @@ -240,7 +240,7 @@ From: amd64/ubuntu:20.04

help() {
echo "Open an interactive Jupyter Notebook. The notebook can be used"
echo "for post-wrangler mapping and variant calling."
echo "for post-wrangler mapping and variant calling."
echo ""
echo "Usage:"
echo " singularity run [options] --app jupyter <container>"\
Expand Down Expand Up @@ -288,7 +288,7 @@ From: amd64/ubuntu:20.04
rsync /opt/resources/*.ipynb /opt/analysis --ignore-existing \
--ignore-missing-args

# Inform the user how to access the notebook
# Inform the user how to access the notebook
echo "Use the following command if you are running this notebook from a"
echo "remote server. Ignore if using a local computer."
echo "ssh -f -N -L localhost:${nb_port}:${server_ip}:${nb_port}"\
Expand Down Expand Up @@ -350,7 +350,7 @@ From: amd64/ubuntu:20.04
echo " -m min_capture_length -p probe_sets -s sample_sets \\"
echo " -x stitch_options -k"
}

# Set defaults
cluster_script="runMIPWranglerCurrent.sh"
server_number=1
Expand Down Expand Up @@ -446,14 +446,14 @@ From: amd64/ubuntu:20.04
echo "It is recommended to run this app in a screen (GNU screen)."
echo "A message indicating the end of download will be printed when done."
echo "Check nohup.out file in your output directory for the download log."

# cd and run app
# Use nohup so the command keeps running even if it receives a hangup signal
cd /opt/analysis
nohup python /opt/bin/BaseSpaceRunDownloader_v2.py \
-r ${run_id} -a "$(cat /opt/resources/access_token.txt)"

# Print to CLI
# Print to CLI
echo "Download finished."

#################################################################
Expand All @@ -464,7 +464,7 @@ From: amd64/ubuntu:20.04
set -eu

help() {
echo "Demultiplex data. Generates per-sample fastq files from the raw"
echo "Demultiplex data. Generates per-sample fastq files from the raw"
echo "sequence data consisting of bcl files."
echo ""
echo "Usage:"
Expand All @@ -475,71 +475,49 @@ From: amd64/ubuntu:20.04
echo ""
echo "App Options:"
echo " -h Print the help page."
echo " -p Required. The sequencing platform used. Either 'miseq'"
echo " or 'nextseq'."
echo " -s Required. The list of samples. Contains the samples used"
echo " in the study, the primers used, etc. This file must be "
echo " present in the directory mounted to '/opt/analysis'."
echo " -s Required. Sample sheet for demultiplexing. "
echo " This file must be present in the directory mounted to "
echo " '/opt/analysis'."
echo ""
echo "Examples:"
echo " # Set paths"
echo " $ resource_dir=/bin/MIPTools/base_resources"
echo " $ bcl_dir=/work/usr/downloaded"
echo " $ fastq_dir=/work/usr/fastq"
echo " $ fastq_root_dir=/work/usr/"
echo ""
echo " # Run app"
echo " $ singularity run \\"
echo " -B \${resource_dir}:/opt/resources \\"
echo " -B \${bcl_dir}:/opt/data \\"
echo " -B \${fastq_dir}:/opt/analysis \\"
echo " --app demux <container> -s sample_list.tsv -p nextseq"
echo " -B \${fastq_root_dir}:/opt/analysis \\"
echo " --app demux <container> -s SampleSheet.csv"
}

while getopts "hp:s:" opt; do
while getopts "hs:" opt; do
case ${opt} in
h) help
exit 1 ;;
p) platform=${OPTARG} ;;
s) sample_list=${OPTARG} ;;
*) help
exit 1 ;;
esac
done

# Define variables
cd /opt/src
template_dir="/opt/resources/templates/sample_sheet_templates/"
platform_template="${platform}"_sample_sheet_template.csv
template="${template_dir}${platform_template}"
bc_dict="/opt/resources/barcode_dict.json"
output_dir="/opt/analysis"
sample_list="/opt/analysis/${sample_list}"

# Create a sample sheet for demultiplexing
python -c 'import mip_functions as mip; mip.generate_sample_sheet(
"'"${sample_list}"'",
"'"${bc_dict}"'",
"'"${template}"'",
"'"${platform}"'",
"'"${output_dir}"'"
)'

sample_sheet="/opt/analysis/${sample_list}"
# cd to where bcl files are
cd /opt/data

# Create a fastq directory for saving fastqs
mkdir -p /opt/analysis/fastq

# Copy sample list to fastq directory
scp ${sample_list} /opt/analysis/fastq/


# Increase limit of open number of files.
ulimit -Sn $(ulimit -Hn)

# Run bcl2fastq
# Use nohup so the command keeps running even if it receives a hangup signal
nohup bcl2fastq -o /opt/analysis/fastq \
--sample-sheet /opt/analysis/SampleSheet.csv \
--sample-sheet ${sample_sheet} \
--no-lane-splitting

##################################################################
Expand Down
File renamed without changes.
2 changes: 1 addition & 1 deletion src/demux_qc.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@

def main(platform, stats_dir):
"""Generate demultiplexing statistics after a sequencing run."""
bc_dict = "/opt/resources/barcode_dict.json"
bc_dict = "/opt/resources/sample_prep/barcode_dict.pickle"

# load barcode dict to be passed to the header-primer conversion function
with open(bc_dict, "rb") as infile:
Expand Down
99 changes: 0 additions & 99 deletions src/mip_functions.py
Original file line number Diff line number Diff line change
Expand Up @@ -9695,105 +9695,6 @@ def save_fasta_dict(fasta_dict, fasta_file, linewidth=60):
outfile.write(fasta_seq[i: i + linewidth] + "\n")


def generate_sample_sheet(sample_list_file,
                          barcode_dict_file,
                          sample_sheet_template,
                          platform,
                          output_dir,
                          warnings=False):
    """Create a sample sheet to be used by bcl2fastq from a sample list.

    The sample list is a tab-separated file whose first non-blank line is a
    header. It must provide the columns "sample_name", "sample_set",
    "replicate", "fw" and "rev". Each sample ID is built as
    "<sample_name>-<sample_set>-<replicate>".

    Parameters
    ----------
    sample_list_file : str
        Path to the tab-separated sample list.
    barcode_dict_file : str
        Path to a pickled dictionary keyed by the "fw"/"rev" values; each
        entry must provide "index_sequence" (and "sequence" for miseq).
    sample_sheet_template : str
        Path to the template whose lines are copied to the top of the
        sample sheet.
    platform : str
        Either "nextseq" or "miseq"; selects which barcode sequence is
        used for the i5 index.
    output_dir : str
        Directory where "SampleSheet.csv" (and, when needed,
        "samples_sharing_barcodes.tsv") is written.
    warnings : bool
        If True, also print a message for sample pairs sharing only one of
        their two barcodes.

    Raises
    ------
    ValueError
        If a sample row has to be processed for a platform other than
        "nextseq" or "miseq".

    Notes
    -----
    Samples whose ID contains characters other than alphanumerics and "-"
    are reported and skipped. A repeated sample ID is reported; the values
    of its last occurrence are used and it is written to the sheet once.
    Samples sharing both barcodes with another sample are left out of the
    sheet and listed in "samples_sharing_barcodes.tsv".
    """
    # Barcode entry field holding the i5 sequence for each platform. The i7
    # sequence always comes from the "index_sequence" field.
    i5_fields = {"nextseq": "index_sequence", "miseq": "sequence"}
    # NOTE(review): pickle.load can execute arbitrary code; the barcode
    # dictionary must come from a trusted resource directory.
    with open(barcode_dict_file, "rb") as in1:
        barcode_dic = pickle.load(in1)
    # read in sample information
    sample_names = []
    sample_info = {}
    with open(sample_list_file) as infile:
        colnames = None
        linenum = 0
        for line in infile:
            stripped = line.strip()
            # Blank lines (e.g. a trailing newline at the end of the file)
            # carry no sample and would otherwise fail on a missing column.
            if not stripped:
                continue
            fields = stripped.split("\t")
            # first line is the header with column names
            if linenum == 0:
                colnames = fields
                linenum += 1
                continue
            sample_dict = dict(zip(colnames, fields))
            sample_set = sample_dict["sample_set"]
            sample_name = sample_dict["sample_name"]
            replicate_number = sample_dict["replicate"]
            forward_index = sample_dict["fw"]
            reverse_index = sample_dict["rev"]
            sample_id = "-".join([sample_name,
                                  sample_set,
                                  replicate_number])
            is_repeat = sample_id in sample_info
            if is_repeat:
                print("Repeating sample name ", sample_id)
            if not sample_id.replace("-", "").isalnum():
                print(("Sample IDs can only contain "
                       "alphanumeric characters and '-'. "
                       "{} has invalid characters.").format(sample_id))
                continue
            # nextseq and miseq barcodes are handled differently; fail
            # early with a clear message instead of a later KeyError.
            if platform not in i5_fields:
                raise ValueError(
                    "Unsupported platform '{}'. Expected one of: {}.".format(
                        platform, ", ".join(sorted(i5_fields))))
            sample_dict.update(
                {"i7": barcode_dic[reverse_index]["index_sequence"],
                 "i5": barcode_dic[forward_index][i5_fields[platform]]})
            sample_dict["sample_index"] = linenum
            linenum += 1
            sample_info[sample_id] = sample_dict
            # Keep one entry per sample ID so that a repeated ID does not
            # produce duplicate rows in the sample sheet.
            if not is_repeat:
                sample_names.append(sample_id)
    # Check for samples sharing one or both barcodes. One barcode sharing is
    # allowed but a warning can be printed if desired by setting the warning
    # to True. If both barcodes are shared among two samples, those samples
    # will be ignored and a message will be broadcast.
    samples_sharing = []
    for s1 in sample_info:
        for s2 in sample_info:
            if s1 == s2:
                continue
            same_fw = sample_info[s1]["fw"] == sample_info[s2]["fw"]
            same_rev = sample_info[s1]["rev"] == sample_info[s2]["rev"]
            if same_fw and same_rev:
                samples_sharing.append([s1, s2])
            elif warnings and (same_fw or same_rev):
                print("Samples %s and %s share a barcode" % (s1, s2))
    samples_sharing_set = set()
    if samples_sharing:
        for pair in samples_sharing:
            samples_sharing_set.update(pair)
        print("There are %d samples sharing the same barcode pair"
              % len(samples_sharing_set))
        pd.DataFrame(samples_sharing).to_csv(
            os.path.join(output_dir, "samples_sharing_barcodes.tsv"),
            sep="\t"
        )
    # create sample sheet
    sample_sheet = os.path.join(output_dir, "SampleSheet.csv")
    with open(sample_sheet_template) as infile, \
            open(sample_sheet, "w") as outfile:
        # Template lines come first, followed by one row per sample.
        outfile_list = [o.strip() for o in infile]
        for sample_id in sample_names:
            if sample_id in samples_sharing_set:
                continue
            info = sample_info[sample_id]
            outlist = [str(info["sample_index"]), sample_id, "", "",
                       "S" + info["rev"],
                       info["i7"],
                       "N" + info["fw"],
                       info["i5"], "", ""]
            outfile_list.append(",".join(outlist))
        outfile.write("\n".join(outfile_list))


def chromosome_converter(chrom, from_malariagen):
""" Convert plasmodium chromosome names from standard (chr1, etc) to
malariagen names (Pf3d7...) and vice versa.
Expand Down
Loading

0 comments on commit 2306380

Please sign in to comment.