From c516897d45dcaa5a84353c463405523d20bee715 Mon Sep 17 00:00:00 2001 From: Daniel Park Date: Sun, 18 Jan 2015 07:37:33 -0500 Subject: [PATCH 01/12] clean up some old-scripts that have been implemented already [ci skip] --- old-scripts/kga/assembly_pipeline_v1.sh | 187 ----------------- old-scripts/kga/ebov_assembly.sh | 256 ------------------------ 2 files changed, 443 deletions(-) delete mode 100755 old-scripts/kga/ebov_assembly.sh diff --git a/old-scripts/kga/assembly_pipeline_v1.sh b/old-scripts/kga/assembly_pipeline_v1.sh index 9507877dd..172421238 100644 --- a/old-scripts/kga/assembly_pipeline_v1.sh +++ b/old-scripts/kga/assembly_pipeline_v1.sh @@ -1,57 +1,3 @@ -## note: this is tailored to Lassa, see ebov_assembly.sh for Ebola-specific tweaks - -#-------- VIRAL GENOME ASSEMBLY @ BROAD --------# -use BLAST+ -use Perl-5.10 -use Samtools -use Python-2.7 -use BWA -use BamTools -use Java-1.7 - -# MAKE REQUIRED SUB-DIRECTORIES -for directory in -do -bsub -W 4:00 -q hour -R "rusage[mem=2]" -o ~/log.txt -P sabeti_align -J Directories "mkdir $directory/_logs $directory/_temp $directory/_reads $directory/_bams $directory/_reports $directory/_pileup $directory/_meta $directory/_refs" -done - -# CONVERT BAM FILES TO FASTQ -for sample in -do -for directory in -do -for suffix in sorted.bam -do -for in_url in /idi/sabeti-data/kandersen/analysis/140416_automation-samples/final_files -do -bsub -W 4:00 -q hour -R "rusage[mem=2]" -o $directory/_logs/$sample.log.bsub.txt -P sabeti_align -J $sample.fq "java -Xmx2g -jar /seq/software/picard/current/bin/SamToFastq.jar INPUT=$in_url/$sample.$suffix FASTQ=$directory/_reads/$sample.reads1.fastq SECOND_END_FASTQ=$directory/_reads/$sample.reads2.fastq VALIDATION_STRINGENCY=SILENT" -done -done -done -done - -#-------- PERFORM DE NOVO ASSEMBLY OF LASSA READS USING TRINITY --------# -# TRIM THE READS WITH TRIMMOMATIC -for sample in -do -for directory in -do -bsub -W 4:00 -q hour -R "rusage[mem=4]" -o $directory/_logs/$sample.log.bsub.txt -P sabeti_meta -J $sample.tr "java -Xmx2g -classpath /idi/sabeti-scratch/kandersen/bin/trimmomatic/trimmomatic-0.32.jar org.usadellab.trimmomatic.TrimmomaticPE $directory/_reads/$sample.reads1.fastq $directory/_reads/$sample.reads2.fastq $directory/_reads/$sample.trimmed.1.fastq $directory/_temp/$sample.reads1.trimmed_unpaired.fastq $directory/_reads/$sample.trimmed.2.fastq $directory/_temp/$sample.reads2.trimmed_unpaired.fastq LEADING:20 TRAILING:20 SLIDINGWINDOW:4:25 MINLEN:30 ILLUMINACLIP:/idi/sabeti-scratch/kandersen/references/contaminants/contaminants.fasta:2:30:12 && rm $directory/_temp/$sample.reads?.trimmed_unpaired.fastq" -done -done - -# LASTAL ALIGNMENT TO FIND RELEVANT READS -for sample in -do -for directory in -do -for database in arena -do -bsub -n 1 -R "span[hosts=1]" -q week -R "rusage[mem=2]" -o $directory/_logs/$sample.log.bsub.txt -P sabeti_align -J $sample.l1 "/idi/sabeti-scratch/kandersen/bin/last/lastal -Q1 /idi/sabeti-scratch/kandersen/references/lastal/$database $directory/_reads/$sample.trimmed.1.fastq | /idi/sabeti-scratch/kandersen/bin/last/scripts/maf-sort.sh -n2 | /idi/sabeti-scratch/kandersen/bin/last/scripts/maf-convert.py tab /dev/stdin > $directory/_temp/$sample.reads1.lastal.txt && python /idi/sabeti-scratch/kandersen/bin/scripts/noBlastLikeHits.py -b $directory/_temp/$sample.reads1.lastal.txt -r $directory/_reads/$sample.trimmed.1.fastq -m hit | perl /idi/sabeti-scratch/kandersen/bin/prinseq/prinseq-lite.pl -ns_max_n 1 -derep 1 -fastq stdin -out_bad null -line_width 0 -out_good $directory/_temp/$sample.lastal.1 && rm $directory/_temp/$sample.reads1.lastal.txt" -bsub -n 1 -R "span[hosts=1]" -q week -R "rusage[mem=2]" -o $directory/_logs/$sample.log.bsub.txt -P sabeti_align -J $sample.l2 "/idi/sabeti-scratch/kandersen/bin/last/lastal -Q1 /idi/sabeti-scratch/kandersen/references/lastal/$database $directory/_reads/$sample.trimmed.2.fastq | /idi/sabeti-scratch/kandersen/bin/last/scripts/maf-sort.sh -n2 | /idi/sabeti-scratch/kandersen/bin/last/scripts/maf-convert.py tab /dev/stdin > $directory/_temp/$sample.reads2.lastal.txt && python /idi/sabeti-scratch/kandersen/bin/scripts/noBlastLikeHits.py -b $directory/_temp/$sample.reads2.lastal.txt -r $directory/_reads/$sample.trimmed.2.fastq -m hit | perl /idi/sabeti-scratch/kandersen/bin/prinseq/prinseq-lite.pl -ns_max_n 1 -derep 1 -fastq stdin -out_bad null -line_width 0 -out_good $directory/_temp/$sample.lastal.2 && rm $directory/_temp/$sample.reads2.lastal.txt" -done -done -done # DE NOVO ASSEMBLY USING TRINITY use Java-1.6 @@ -86,7 +32,6 @@ bsub -W 4:00 -q hour -R "rusage[mem=2]" -o $directory/_logs/$sample.log.bsub.txt done done -#-------- MAP LASSA READS TO MODIFIED CONTIGS USING NOVOALIGN --------# # ALIGN REFERENCES AND CONTIGS THAT PASSED LENGTH FILTER for sample in do @@ -116,95 +61,9 @@ done done done -# INDEX MODIFIED CONTIGS -for sample in -do -for directory in -do -bsub -W 4:00 -q hour -R "rusage[mem=2]" -o $directory/_logs/$sample.log.bsub.txt -P sabeti_kga -J $sample.i1 "java -Xmx2g -jar /seq/software/picard/current/bin/CreateSequenceDictionary.jar R=$directory/_temp/$sample.s_segment2.fasta O=$directory/_temp/$sample.s_segment2.dict && /idi/sabeti-scratch/kandersen/bin/novocraft/novoindex $directory/_temp/$sample.s_segment2.nix $directory/_temp/$sample.s_segment2.fasta && samtools faidx $directory/_temp/$sample.s_segment2.fasta && java -Xmx2g -jar /seq/software/picard/current/bin/CreateSequenceDictionary.jar R=$directory/_temp/$sample.s_segment2a.fasta O=$directory/_temp/$sample.s_segment2a.dict && samtools faidx $directory/_temp/$sample.s_segment2a.fasta" -bsub -W 4:00 -q hour -R "rusage[mem=2]" -o $directory/_logs/$sample.log.bsub.txt -P sabeti_kga -J $sample.i2 "java -Xmx2g -jar /seq/software/picard/current/bin/CreateSequenceDictionary.jar R=$directory/_temp/$sample.l_segment2.fasta O=$directory/_temp/$sample.l_segment2.dict && /idi/sabeti-scratch/kandersen/bin/novocraft/novoindex $directory/_temp/$sample.l_segment2.nix $directory/_temp/$sample.l_segment2.fasta && samtools faidx $directory/_temp/$sample.l_segment2.fasta && java -Xmx2g -jar /seq/software/picard/current/bin/CreateSequenceDictionary.jar R=$directory/_temp/$sample.l_segment2a.fasta O=$directory/_temp/$sample.l_segment2a.dict && samtools faidx $directory/_temp/$sample.l_segment2a.fasta" -done -done - -# NOVOALIGN LASSA READS TO MODIFIED CONTIGS -for sample in -do -for directory in -do -bsub -W 4:00 -q hour -R "rusage[mem=2]" -n 1 -R "span[hosts=1]" -o $directory/_logs/$sample.log.bsub.txt -P sabeti_align -J $sample.a1 "/idi/sabeti-scratch/kandersen/bin/novocraft_v3/novoalign -f $directory/_temp/$sample.clean.1.fastq $directory/_temp/$sample.clean.2.fastq -r Random -l 30 -g 40 -x 20 -t 502 -F STDFQ -d $directory/_temp/$sample.s_segment2.nix -o SAM $'@RG\tID:$sample\tSM:$sample\tPL:Illumina\tPU:HiSeq\tLB:BroadPE\tCN:Broad' | samtools view -buS -q 1 - | java -Xmx2g -jar /seq/software/picard/current/bin/SortSam.jar SO=coordinate I=/dev/stdin O=$directory/_temp/$sample.ref_mapped_s1.bam CREATE_INDEX=true VALIDATION_STRINGENCY=SILENT && java -Xmx2g -jar /humgen/gsa-hpprojects/GATK/bin/current/GenomeAnalysisTK.jar -T UnifiedGenotyper -R $directory/_temp/$sample.s_segment2a.fasta -I $directory/_temp/$sample.ref_mapped_s1.bam -o $directory/_temp/$sample.gatk_s1.vcf --baq OFF --useOriginalQualities -out_mode EMIT_ALL_SITES -dt NONE --num_threads 1 --min_base_quality_score 15 -ploidy 4 -stand_call_conf 0 -stand_emit_conf 0 -A AlleleBalance && python ~dpark/dev/sabetilab-dpark/scripts/viral.py vcf_to_fasta --trim_ends --min_coverage 2 $directory/_temp/$sample.gatk_s1.vcf $directory/_temp/$sample.s_segment3.fasta && python ~dpark/dev/sabetilab-dpark/scripts/viral.py deambig_fasta $directory/_temp/$sample.s_segment3.fasta $directory/_temp/$sample.s_segment3a.fasta" -bsub -W 4:00 -q hour -R "rusage[mem=2]" -n 1 -R "span[hosts=1]" -o $directory/_logs/$sample.log.bsub.txt -P sabeti_align -J $sample.a2 "/idi/sabeti-scratch/kandersen/bin/novocraft_v3/novoalign -f $directory/_temp/$sample.clean.1.fastq $directory/_temp/$sample.clean.2.fastq -r Random -l 30 -g 40 -x 20 -t 502 -F STDFQ -d $directory/_temp/$sample.l_segment2.nix -o SAM $'@RG\tID:$sample\tSM:$sample\tPL:Illumina\tPU:HiSeq\tLB:BroadPE\tCN:Broad' | samtools view -buS -q 1 - | java -Xmx2g -jar /seq/software/picard/current/bin/SortSam.jar SO=coordinate I=/dev/stdin O=$directory/_temp/$sample.ref_mapped_l1.bam CREATE_INDEX=true VALIDATION_STRINGENCY=SILENT && java -Xmx2g -jar /humgen/gsa-hpprojects/GATK/bin/current/GenomeAnalysisTK.jar -T UnifiedGenotyper -R $directory/_temp/$sample.l_segment2a.fasta -I $directory/_temp/$sample.ref_mapped_l1.bam -o $directory/_temp/$sample.gatk_l1.vcf --baq OFF --useOriginalQualities -out_mode EMIT_ALL_SITES -dt NONE --num_threads 1 --min_base_quality_score 15 -ploidy 4 -stand_call_conf 0 -stand_emit_conf 0 -A AlleleBalance && python ~dpark/dev/sabetilab-dpark/scripts/viral.py vcf_to_fasta --trim_ends --min_coverage 2 $directory/_temp/$sample.gatk_l1.vcf $directory/_temp/$sample.l_segment3.fasta && python ~dpark/dev/sabetilab-dpark/scripts/viral.py deambig_fasta $directory/_temp/$sample.l_segment3.fasta $directory/_temp/$sample.l_segment3a.fasta" -done -done - -# INDEX CONSENSUS SEQUENCES -for sample in -do -for directory in -do -bsub -W 4:00 -q hour -R "rusage[mem=2]" -o $directory/_logs/$sample.log.bsub.txt -P sabeti_kga -J $sample.i1 "java -Xmx2g -jar /seq/software/picard/current/bin/CreateSequenceDictionary.jar R=$directory/_temp/$sample.s_segment3.fasta O=$directory/_temp/$sample.s_segment3.dict && /idi/sabeti-scratch/kandersen/bin/novocraft/novoindex $directory/_temp/$sample.s_segment3.nix $directory/_temp/$sample.s_segment3.fasta && samtools faidx $directory/_temp/$sample.s_segment3.fasta && java -Xmx2g -jar /seq/software/picard/current/bin/CreateSequenceDictionary.jar R=$directory/_temp/$sample.s_segment3a.fasta O=$directory/_temp/$sample.s_segment3a.dict && samtools faidx $directory/_temp/$sample.s_segment3a.fasta" -bsub -W 4:00 -q hour -R "rusage[mem=2]" -o $directory/_logs/$sample.log.bsub.txt -P sabeti_kga -J $sample.i2 "java -Xmx2g -jar /seq/software/picard/current/bin/CreateSequenceDictionary.jar R=$directory/_temp/$sample.l_segment3.fasta O=$directory/_temp/$sample.l_segment3.dict && /idi/sabeti-scratch/kandersen/bin/novocraft/novoindex $directory/_temp/$sample.l_segment3.nix $directory/_temp/$sample.l_segment3.fasta && samtools faidx $directory/_temp/$sample.l_segment3.fasta && java -Xmx2g -jar /seq/software/picard/current/bin/CreateSequenceDictionary.jar R=$directory/_temp/$sample.l_segment3a.fasta O=$directory/_temp/$sample.l_segment3a.dict && samtools faidx $directory/_temp/$sample.l_segment3a.fasta" -done -done - -# NOVOALIGN ALL READS TO CONSENSUS SEQUENCES -for sample in -do -for directory in -do -bsub -W 4:00 -q hour -R "rusage[mem=2]" -n 1 -R "span[hosts=1]" -o $directory/_logs/$sample.log.bsub.txt -P sabeti_align -J $sample.a1 "/idi/sabeti-scratch/kandersen/bin/novocraft_v3/novoalign -f $directory/_reads/$sample.reads1.fastq $directory/_reads/$sample.reads2.fastq -r Random -l 40 -g 40 -x 20 -t 100 -F STDFQ -d $directory/_temp/$sample.s_segment3.nix -o SAM $'@RG\tID:$sample\tSM:$sample\tPL:Illumina\tPU:HiSeq\tLB:BroadPE\tCN:Broad' | samtools view -buS -q 1 - | java -Xmx2g -jar /seq/software/picard/current/bin/SortSam.jar SO=coordinate I=/dev/stdin O=$directory/_temp/$sample.ref_mapped_s2.bam CREATE_INDEX=true VALIDATION_STRINGENCY=SILENT && java -Xmx2g -jar /humgen/gsa-hpprojects/GATK/bin/current/GenomeAnalysisTK.jar -T UnifiedGenotyper -R $directory/_temp/$sample.s_segment3a.fasta -I $directory/_temp/$sample.ref_mapped_s2.bam -o $directory/_temp/$sample.gatk_s2.vcf --baq OFF --useOriginalQualities -out_mode EMIT_ALL_SITES -dt NONE --num_threads 1 --min_base_quality_score 15 -ploidy 4 -stand_call_conf 0 -stand_emit_conf 0 -A AlleleBalance && python ~dpark/dev/sabetilab-dpark/scripts/viral.py vcf_to_fasta --trim_ends --min_coverage 2 $directory/_temp/$sample.gatk_s2.vcf $directory/_temp/$sample.s_segment4.fasta" -bsub -W 4:00 -q hour -R "rusage[mem=2]" -n 1 -R "span[hosts=1]" -o $directory/_logs/$sample.log.bsub.txt -P sabeti_align -J $sample.a2 "/idi/sabeti-scratch/kandersen/bin/novocraft_v3/novoalign -f $directory/_reads/$sample.reads1.fastq $directory/_reads/$sample.reads2.fastq -r Random -l 40 -g 40 -x 20 -t 100 -F STDFQ -d $directory/_temp/$sample.l_segment3.nix -o SAM $'@RG\tID:$sample\tSM:$sample\tPL:Illumina\tPU:HiSeq\tLB:BroadPE\tCN:Broad' | samtools view -buS -q 1 - | java -Xmx2g -jar /seq/software/picard/current/bin/SortSam.jar SO=coordinate I=/dev/stdin O=$directory/_temp/$sample.ref_mapped_l2.bam CREATE_INDEX=true VALIDATION_STRINGENCY=SILENT && java -Xmx2g -jar /humgen/gsa-hpprojects/GATK/bin/current/GenomeAnalysisTK.jar -T UnifiedGenotyper -R $directory/_temp/$sample.l_segment3a.fasta -I $directory/_temp/$sample.ref_mapped_l2.bam -o $directory/_temp/$sample.gatk_l2.vcf --baq OFF --useOriginalQualities -out_mode EMIT_ALL_SITES -dt NONE --num_threads 1 --min_base_quality_score 15 -ploidy 4 -stand_call_conf 0 -stand_emit_conf 0 -A AlleleBalance && python ~dpark/dev/sabetilab-dpark/scripts/viral.py vcf_to_fasta --trim_ends --min_coverage 2 $directory/_temp/$sample.gatk_l2.vcf $directory/_temp/$sample.l_segment4.fasta" -done -done - -#-------- PREPARE FINAL MAPPED ASSEMBLY USING NOVOALIGN --------# -# INDEX FINAL CONSENSUS SEQUENCES -for sample in -do -for directory in -do -bsub -W 4:00 -q hour -R "rusage[mem=2]" -o $directory/_logs/$sample.log.bsub.txt -P sabeti_kga -J $sample.ix "cat $directory/_temp/$sample.l_segment4.fasta $directory/_temp/$sample.s_segment4.fasta > $directory/_refs/$sample.fasta && java -Xmx2g -jar /seq/software/picard/current/bin/CreateSequenceDictionary.jar R=$directory/_refs/$sample.fasta O=$directory/_refs/$sample.dict && /idi/sabeti-scratch/kandersen/bin/novocraft/novoindex $directory/_refs/$sample.nix $directory/_refs/$sample.fasta && samtools faidx $directory/_refs/$sample.fasta" -done -done - -# ALIGN ALL READS TO ITS OWN LASSA CONSENSUS -for sample in -do -for directory in -do -for date in -do -bsub -W 4:00 -q hour -R "rusage[mem=2]" -n 4 -R "span[hosts=1]" -o $directory/_logs/$sample.log.bsub.txt -P sabeti_align -J $sample.al "/idi/sabeti-scratch/kandersen/bin/novocraft_v3/novoalign -k -c 3 -f $directory/_reads/$sample.reads1.fastq $directory/_reads/$sample.reads2.fastq -r Random -l 40 -g 40 -x 20 -t 100 -F STDFQ -d $directory/_refs/$sample.nix -o SAM $'@RG\tID:$date.$sample\tSM:$sample\tPL:Illumina\tPU:HiSeq\tLB:BroadPE\tCN:Broad' 2> $directory/_logs/$sample.log.novoalign.txt | java -Xmx2g -jar /seq/software/picard/current/bin/SortSam.jar SO=coordinate I=/dev/stdin O=$directory/_bams/$sample.sorted.bam CREATE_INDEX=true && samtools view -b -q 1 -u $directory/_bams/$sample.sorted.bam | java -Xmx2g -jar /seq/software/picard/current/bin/SortSam.jar SO=coordinate I=/dev/stdin O=$directory/_temp/$sample.mapped.bam CREATE_INDEX=true VALIDATION_STRINGENCY=SILENT && java -Xmx2g -jar /seq/software/picard/current/bin/MarkDuplicates.jar I=$directory/_temp/$sample.mapped.bam O=$directory/_temp/$sample.mappedNoDub.bam METRICS_FILE=$directory/_temp/$sample.log.markdups.txt CREATE_INDEX=true REMOVE_DUPLICATES=true" -done -done -done -#-------- PREPARE ALIGNMENT AND CALCULATE METRICS --------# -# RUN FASTQC REPORT -for sample in -do -for directory in -do -bsub -W 4:00 -q hour -R "rusage[mem=2]" -o $directory/_logs/$sample.log.bsub.txt -P sabeti_align -J $sample.dx "/idi/sabeti-scratch/kandersen/bin/fastqc/fastqc -f bam $directory/_bams/$sample.sorted.bam -o $directory/_reports/" -done -done -# PERFORM LOCAL REALIGNMENT -for sample in -do -for directory in -do -bsub -W 4:00 -q hour -R "rusage[mem=2]" -o $directory/_logs/$sample.log.bsub.txt -P sabeti_align -J $sample.lr "java -Xmx2g -jar /humgen/gsa-hpprojects/GATK/bin/current/GenomeAnalysisTK.jar -T RealignerTargetCreator -R $directory/_refs/$sample.fasta -o $directory/_temp/$sample.log.realigner.intervals -I $directory/_temp/$sample.mappedNoDub.bam && java -Xmx2g -jar /humgen/gsa-hpprojects/GATK/bin/current/GenomeAnalysisTK.jar -T IndelRealigner -R $directory/_refs/$sample.fasta -targetIntervals $directory/_temp/$sample.log.realigner.intervals -I $directory/_temp/$sample.mappedNoDub.bam -o $directory/_bams/$sample.realigned.bam" -done -done -# CALCULATE COVERAGE & SNPS -for sample in -do -for directory in -do -bsub -W 4:00 -q hour -R "rusage[mem=2]" -o $directory/_logs/$sample.log.bsub.txt -P sabeti_align -J $sample.st "/idi/sabeti-scratch/kandersen/bin/bedtools/genomeCoverageBed -d -ibam $directory/_bams/$sample.realigned.bam -g $directory/_refs/$sample.fasta > $directory/_pileup/$sample.coverage.txt && bamtools stats -insert -in $directory/_bams/$sample.realigned.bam > $directory/_logs/$sample.log.bamstats_realigned.txt && java -jar /humgen/gsa-hpprojects/GATK/bin/current/GenomeAnalysisTK.jar -T UnifiedGenotyper -R $directory/_refs/$sample.fasta -I $directory/_bams/$sample.realigned.bam -o $directory/_temp/$sample.snps.vcf --baq OFF --useOriginalQualities -out_mode EMIT_VARIANTS_ONLY -dt NONE --num_threads 1 --min_base_quality_score 20 -ploidy 10 -stand_call_conf 80.0 -stand_emit_conf 30.0 -A AlleleBalance" -done -done # FILTER SNPS for sample in @@ -224,49 +83,3 @@ bsub -W 4:00 -q hour -R "rusage[mem=2]" -o $directory/_logs/$sample.log.bsub.txt done done -#-------- SHORT META ANALYSIS @ BROAD --------# -# SUBSAMPLE READS FOR LATER RETRIEVAL -for sample in -do -for directory in -do -bsub -R "rusage[mem=6]" -W 4:00 -o $directory/_logs/$sample.log.bsub.txt -P sabeti_meta -J $sample.ss "python /idi/sabeti-scratch/kandersen/bin/scripts/subsampler.py -n 1000000 -mode p -in $directory/_reads/$sample.reads1.fastq $directory/_reads/$sample.reads2.fastq -out $directory/_temp/$sample.reads1.sub.fastq $directory/_temp/$sample.reads2.sub.fastq" -done -done - -# COUNT SPIKE-INS -for sample in -do -for directory in -do -bsub -W 4:00 -q hour -R "rusage[mem=2]" -o $directory/_logs/$sample.log.bsub.txt -P sabeti_align -J $sample.si "/idi/sabeti-scratch/kandersen/bin/novocraft_v3/novoalign -f $directory/_temp/$sample.reads1.sub.fastq $directory/_temp/$sample.reads2.sub.fastq -r Random -F STDFQ -d /idi/sabeti-scratch/kandersen/references/other/ercc_spike-ins.nix -o SAM 2> $directory/_temp/$sample.log.spike_novo.txt | java -Xmx2g -jar /seq/software/picard/current/bin/SortSam.jar SO=coordinate I=/dev/stdin O=$directory/_temp/$sample.spikes.bam CREATE_INDEX=true && samtools view -b -q 1 -u $directory/_temp/$sample.spikes.bam | java -Xmx2g -jar /seq/software/picard/current/bin/SortSam.jar SO=coordinate I=/dev/stdin O=$directory/_temp/$sample.spikes_mapped.bam CREATE_INDEX=true VALIDATION_STRINGENCY=SILENT && /idi/sabeti-scratch/kandersen/bin/scripts/CountAlignmentsByDescriptionLine -bam $directory/_temp/$sample.spikes_mapped.bam > $directory/_logs/$sample.log.spike_count.txt" -done -done - -# ALIGN TO HUMAN OR OTHER REFERENCE USING BWA -for sample in -do -for directory in -do -bsub -R "rusage[mem=8]" -W 4:00 -o $directory/_logs/$sample.log.bsub.txt -P sabeti_align -J $sample.hg "perl /idi/sabeti-scratch/kandersen/bin/prinseq/prinseq-lite.pl -lc_method dust -lc_threshold 7 -line_width 0 -fastq $directory/_temp/$sample.reads1.sub.fastq -out_good stdout -out_bad null | /idi/sabeti-scratch/kandersen/bin/bwa/bwa aln -q 5 -l 32 /idi/sabeti-scratch/kandersen/references/human/hg19 - | /idi/sabeti-scratch/kandersen/bin/bwa/bwa samse /idi/sabeti-scratch/kandersen/references/human/hg19 - $directory/_temp/$sample.reads1.sub.fastq | java -Xmx2g -jar /seq/software/picard/current/bin/SortSam.jar SO=coordinate I=/dev/stdin O=/dev/stdout VALIDATION_STRINGENCY=SILENT | samtools flagstat - > $directory/_logs/$sample.log.hg19.txt" -done -done - -# PERFORM BLASTN ANALYSIS -for sample in -do -for directory in -do -bsub -W 4:00 -R "rusage[mem=12]" -o $directory/_logs/$sample.log.bsub.txt -P sabeti_align -J $sample.fa "/idi/sabeti-scratch/kandersen/bin/prinseq/prinseq-lite.pl -min_len 40 -seq_num 10000 -out_format 1 -line_width 0 -fastq $directory/_temp/$sample.reads1.sub.fastq -out_good $directory/_temp/$sample -out_bad null && blastn -db /idi/sabeti-scratch/kandersen/references/blast/nt -outfmt 6 -evalue 1e-10 -num_descriptions 30 -query $directory/_temp/$sample.fasta -out $directory/_temp/$sample.blastn.txt" -done -done - -# CREATE MEGAN FILES -# You will have to be running an xhost in order to do this step: open /Applications/Utilities/X11.app/ xhost + and export DISPLAY=:0 -for sample in -do -for directory in -do -bsub -R "rusage[mem=4]" -W 4:00 -o $directory/_logs/$sample.log.bsub.txt -P sabeti_align -J $sample.mg /idi/sabeti-scratch/kandersen/bin/megan/MEGAN -E +g -p "/idi/sabeti-scratch/kandersen/bin/megan/Megan.def" -x "load gi2taxfile="/idi/sabeti-scratch/kandersen/bin/megan/class/resources/files/gi_taxid_nucl.bin"; import blastfile="$directory/_temp/$sample.blastn.txt" fastafile="$directory/_temp/$sample.fasta" meganfile="$directory/_meta/$sample.rma" useseed=false usekegg=false; select nodes=all; uncollapse subtrees; update; exportimage file="$directory/_meta/$sample.pdf" format=pdf replace=true; quit;" -done -done \ No newline at end of file diff --git a/old-scripts/kga/ebov_assembly.sh b/old-scripts/kga/ebov_assembly.sh deleted file mode 100755 index 585e8c5da..000000000 --- a/old-scripts/kga/ebov_assembly.sh +++ /dev/null @@ -1,256 +0,0 @@ -#-------- VIRAL GENOME ASSEMBLY @ BROAD --------# -use BLAST+ -use Perl-5.10 -use Samtools -use Python-2.7 -use BWA -use BamTools -use Java-1.7 - -# MAKE REQUIRED SUB-DIRECTORIES -for directory in /idi/sabeti-data/kandersen/analysis/141017_danny-ebov -do -bsub -W 4:00 -q hour -R "rusage[mem=2]" -o ~/log.txt -P sabeti_align -J Directories "mkdir $directory/_logs $directory/_temp $directory/_reads $directory/_bams $directory/_reports $directory/_pileup $directory/_meta $directory/_refs" -done - -# CONVERT BAM FILES TO FASTQ -for sample in G3789.1 G3823 G3823_r1 G3789.3 -do -for directory in /idi/sabeti-data/kandersen/analysis/141017_danny-ebov -do -for suffix in sorted.bam -do -for in_url in /idi/sabeti-data/kandersen/sequencing_storage/ebov/bam_sorted -do -bsub -W 4:00 -q hour -R "rusage[mem=2]" -o $directory/_logs/$sample.log.bsub.txt -P sabeti_align -J $sample.fq "java -Xmx2g -jar /seq/software/picard/current/bin/SamToFastq.jar INPUT=$in_url/$sample.$suffix FASTQ=$directory/_reads/$sample.reads1.fastq SECOND_END_FASTQ=$directory/_reads/$sample.reads2.fastq VALIDATION_STRINGENCY=SILENT" -done -done -done -done - -#-------- PERFORM DE NOVO ASSEMBLY OF EBOV READS USING TRINITY --------# -# TRIM THE READS WITH TRIMMOMATIC -for sample in G3789.1 G3823 G3823_r1 G3789.3 -do -for directory in /idi/sabeti-data/kandersen/analysis/141017_danny-ebov -do -bsub -W 4:00 -q hour -R "rusage[mem=4]" -o $directory/_logs/$sample.log.bsub.txt -P sabeti_meta -J $sample.tr "java -Xmx2g -classpath /idi/sabeti-scratch/kandersen/bin/trimmomatic/trimmomatic-0.32.jar org.usadellab.trimmomatic.TrimmomaticPE $directory/_reads/$sample.reads1.fastq $directory/_reads/$sample.reads2.fastq $directory/_reads/$sample.trimmed.1.fastq $directory/_temp/$sample.reads1.trimmed_unpaired.fastq $directory/_reads/$sample.trimmed.2.fastq $directory/_temp/$sample.reads2.trimmed_unpaired.fastq LEADING:20 TRAILING:20 SLIDINGWINDOW:4:25 MINLEN:30 ILLUMINACLIP:/idi/sabeti-scratch/kandersen/references/contaminants/contaminants.fasta:2:30:12 && rm $directory/_temp/$sample.reads?.trimmed_unpaired.fastq" -done -done - -# LASTAL ALIGNMENT TO FIND RELEVANT READS -for sample in G3789.1 G3823 G3823_r1 G3789.3 -do -for directory in /idi/sabeti-data/kandersen/analysis/141017_danny-ebov -do -for database in ebola -do -bsub -n 1 -R "span[hosts=1]" -q week -R "rusage[mem=2]" -o $directory/_logs/$sample.log.bsub.txt -P sabeti_align -J $sample.l1 "/idi/sabeti-scratch/kandersen/bin/last/lastal -Q1 /idi/sabeti-scratch/kandersen/references/lastal/$database $directory/_reads/$sample.trimmed.1.fastq | /idi/sabeti-scratch/kandersen/bin/last/scripts/maf-sort.sh -n2 | /idi/sabeti-scratch/kandersen/bin/last/scripts/maf-convert.py tab /dev/stdin > $directory/_temp/$sample.reads1.lastal.txt && python /idi/sabeti-scratch/kandersen/bin/scripts/noBlastLikeHits.py -b $directory/_temp/$sample.reads1.lastal.txt -r $directory/_reads/$sample.trimmed.1.fastq -m hit | perl /idi/sabeti-scratch/kandersen/bin/prinseq/prinseq-lite.pl -ns_max_n 1 -derep 1 -fastq stdin -out_bad null -line_width 0 -out_good $directory/_temp/$sample.lastal.1" -bsub -n 1 -R "span[hosts=1]" -q week -R "rusage[mem=2]" -o $directory/_logs/$sample.log.bsub.txt -P sabeti_align -J $sample.l2 "/idi/sabeti-scratch/kandersen/bin/last/lastal -Q1 /idi/sabeti-scratch/kandersen/references/lastal/$database $directory/_reads/$sample.trimmed.2.fastq | /idi/sabeti-scratch/kandersen/bin/last/scripts/maf-sort.sh -n2 | /idi/sabeti-scratch/kandersen/bin/last/scripts/maf-convert.py tab /dev/stdin > $directory/_temp/$sample.reads2.lastal.txt && python /idi/sabeti-scratch/kandersen/bin/scripts/noBlastLikeHits.py -b $directory/_temp/$sample.reads2.lastal.txt -r $directory/_reads/$sample.trimmed.2.fastq -m hit | perl /idi/sabeti-scratch/kandersen/bin/prinseq/prinseq-lite.pl -ns_max_n 1 -derep 1 -fastq stdin -out_bad null -line_width 0 -out_good $directory/_temp/$sample.lastal.2" -done -done -done - -# DE NOVO ASSEMBLY USING TRINITY -use Java-1.6 -for sample in G3789.1 G3823 G3823_r1 G3789.3 -do -for directory in /idi/sabeti-data/kandersen/analysis/141017_danny-ebov -do -bsub -R "rusage[mem=2]" -n 1 -R "span[hosts=1]" -q hour -W 4:00 -o $directory/_logs/$sample.log.denovo.txt -P sabeti_meta -J $sample.dn "/idi/sabeti-scratch/kandersen/bin/scripts/mergeShuffledFastqSeqs.pl -t -r '^@(\S+)/[1|2]$' -f1 $directory/_temp/$sample.lastal.1.fastq -f2 $directory/_temp/$sample.lastal.2.fastq -o $directory/_temp/$sample.clean && /idi/sabeti-scratch/kandersen/bin/scripts/subsampler.py -n 100000 -mode p -in $directory/_temp/$sample.clean.1.fastq $directory/_temp/$sample.clean.2.fastq -out $directory/_temp/$sample.reads1.sub.fastq $directory/_temp/$sample.reads2.sub.fastq && wc -l $directory/_temp/$sample.clean.?.fastq > $directory/_logs/$sample.log.lastal.txt && perl /idi/sabeti-scratch/kandersen/bin/trinity_old/Trinity.pl --CPU 1 --min_contig_length 300 --seqType fq --left $directory/_temp/$sample.reads1.sub.fastq --right $directory/_temp/$sample.reads2.sub.fastq --output $directory/_temp/$sample.trinity && mv $directory/_temp/$sample.trinity/Trinity.fasta $directory/_pileup/$sample.contigs.fasta && rm $directory/_temp/$sample.clean.nomatch.fastq && rm $directory/_temp/$sample.reads?.sub.fastq" -done -done - -# ALIGN AND ORIENT CONTIGS TO REFERENCES -for sample in G3789.1 G3823 G3823_r1 G3789.3 -do -for directory in /idi/sabeti-data/kandersen/analysis/141017_danny-ebov -do -for ref_name in zaire_guinea -do -bsub -W 4:00 -q hour -R "rusage[mem=2]" -o $directory/_logs/$sample.log.bsub.txt -P sabeti_meta -J $sample.c1 "touch $directory/_temp/$sample.s_segment1_merger_assembly.fa && perl /idi/sabeti-scratch/kandersen/bin/VfatSoftwarePackage/orientContig.pl $directory/_pileup/$sample.contigs.fasta /idi/sabeti-scratch/kandersen/references/annotations/ebola/$ref_name.fasta $directory/_temp/$sample.s_segment1 && perl /idi/sabeti-scratch/kandersen/bin/VfatSoftwarePackage/contigMerger.pl $directory/_temp/$sample.s_segment1_orientedContigs /idi/sabeti-scratch/kandersen/references/annotations/ebola/$ref_name.fasta -readfq $directory/_temp/$sample.clean.1.fastq -readfq2 $directory/_temp/$sample.clean.2.fastq -fakequals 30 $directory/_temp/$sample.s_segment1 && cat $directory/_temp/$sample.s_segment1*assembly.fa > $directory/_temp/$sample.s_segment1.fasta" -done -done -done - -# SORT CONTIGS BASED ON LENGTH -for sample in G3789.1 G3823 G3823_r1 G3789.3 -do -for directory in /idi/sabeti-data/kandersen/analysis/141017_danny-ebov -do -bsub -W 4:00 -q hour -R "rusage[mem=2]" -o $directory/_logs/$sample.log.bsub.txt -P sabeti_meta -J $sample.s1 "python /idi/sabeti-data/rsealfon/assembly_scripts/filter_short_seqs.py $directory/_temp/$sample.s_segment1.fasta 15000 $directory/_temp/$sample.s_segment1.bad.fasta $directory/_temp/$sample.s_segment1.good.fasta && rm $directory/_temp/$sample.*.fa $directory/_temp/$sample.*.txt $directory/_temp/$sample.*.qlx $directory/_temp/$sample.*.pdf $directory/_temp/$sample.*.R $directory/_temp/$sample.*.afa $directory/_temp/$sample.*.mfa" -done -done - -#-------- MAP EBOV READS TO MODIFIED CONTIGS USING NOVOALIGN --------# -# ALIGN REFERENCES AND CONTIGS THAT PASSED LENGTH FILTER -for sample in G3789.1 G3823 G3823_r1 G3789.3 -do -for directory in /idi/sabeti-data/kandersen/analysis/141017_danny-ebov -do -for ref_name in zaire_guinea -do -bsub -W 4:00 -q hour -R "rusage[mem=2]" -o $directory/_logs/$sample.log.bsub.txt -P sabeti_meta -J $sample.m1 "cat $directory/_temp/$sample.s_segment1.good.fasta /idi/sabeti-scratch/kandersen/references/annotations/ebola/$ref_name.fasta | /idi/sabeti-scratch/kandersen/bin/muscle/muscle -out $directory/_temp/$sample.s_alignment1.fasta -quiet" -done -done -done - -# CLEANUP CONTIGS -for sample in G3789.1 G3823 G3823_r1 G3789.3 -do -for directory in /idi/sabeti-data/kandersen/analysis/141017_danny-ebov -do -for country in SEL -do -for year in 2014 -do -for ref_name in zaire_guinea -do -bsub -W 4:00 -q hour -R "rusage[mem=2]" -o $directory/_logs/$sample.log.bsub.txt -P sabeti_meta -J $sample.c1 "python /idi/sabeti-data/rsealfon/assembly_scripts/modified_contig/modify_contig.py -n EBOV-$country-$year-$sample --call-reference-ns yes --trim-ends yes --replace-5ends yes --replace-3ends yes --replace-length 20 --replace-end-gaps yes --remove-end-ns no --call-reference-ambiguous no $directory/_temp/$sample.s_alignment1.fasta $directory/_temp/$sample.s_segment2.fasta $ref_name && python /home/unix/dpark/dev/dpark-scripts/viral.py deambig_fasta $directory/_temp/$sample.s_segment2.fasta $directory/_temp/$sample.s_segment2a.fasta" -done -done -done -done -done - -# INDEX MODIFIED CONTIGS -for sample in G3789.1 G3823 G3823_r1 G3789.3 -do -for directory in /idi/sabeti-data/kandersen/analysis/141017_danny-ebov -do -bsub -W 4:00 -q hour -R "rusage[mem=2]" -o $directory/_logs/$sample.log.bsub.txt -P sabeti_kga -J $sample.i1 "java -Xmx2g -jar /seq/software/picard/current/bin/CreateSequenceDictionary.jar R=$directory/_temp/$sample.s_segment2.fasta O=$directory/_temp/$sample.s_segment2.dict && /idi/sabeti-scratch/kandersen/bin/novocraft/novoindex $directory/_temp/$sample.s_segment2.nix $directory/_temp/$sample.s_segment2.fasta && samtools faidx $directory/_temp/$sample.s_segment2.fasta && java -Xmx2g -jar /seq/software/picard/current/bin/CreateSequenceDictionary.jar R=$directory/_temp/$sample.s_segment2a.fasta O=$directory/_temp/$sample.s_segment2a.dict && samtools faidx $directory/_temp/$sample.s_segment2a.fasta" -done -done - -# NOVOALIGN EBOV READS TO MODIFIED CONTIGS -for sample in G3789.1 G3823 G3823_r1 G3789.3 -do -for directory in /idi/sabeti-data/kandersen/analysis/141017_danny-ebov -do -bsub -W 4:00 -q hour -R "rusage[mem=2]" -n 1 -R "span[hosts=1]" -o $directory/_logs/$sample.log.bsub.txt -P sabeti_align -J $sample.a1 "/idi/sabeti-scratch/kandersen/bin/novocraft_v3/novoalign -f $directory/_temp/$sample.clean.1.fastq $directory/_temp/$sample.clean.2.fastq -r Random -l 30 -g 40 -x 20 -t 502 -F STDFQ -d $directory/_temp/$sample.s_segment2.nix -o SAM $'@RG\tID:$sample\tSM:$sample\tPL:Illumina\tPU:HiSeq\tLB:BroadPE\tCN:Broad' | samtools view -buS -q 1 - | java -Xmx2g -jar /seq/software/picard/current/bin/SortSam.jar SO=coordinate I=/dev/stdin O=$directory/_temp/$sample.ref_mapped_s1.bam CREATE_INDEX=true VALIDATION_STRINGENCY=SILENT && java -Xmx2g -jar /humgen/gsa-hpprojects/GATK/bin/current/GenomeAnalysisTK.jar -T UnifiedGenotyper -R $directory/_temp/$sample.s_segment2a.fasta -I $directory/_temp/$sample.ref_mapped_s1.bam -o $directory/_temp/$sample.gatk_s1.vcf --baq OFF --useOriginalQualities -out_mode EMIT_ALL_SITES -dt NONE --num_threads 1 --min_base_quality_score 15 -ploidy 4 -stand_call_conf 0 -stand_emit_conf 0 -A AlleleBalance && python /home/unix/dpark/dev/dpark-scripts/viral.py vcf_to_fasta --trim_ends --min_coverage 2 $directory/_temp/$sample.gatk_s1.vcf $directory/_temp/$sample.s_segment3.fasta && python /home/unix/dpark/dev/dpark-scripts/viral.py deambig_fasta $directory/_temp/$sample.s_segment3.fasta $directory/_temp/$sample.s_segment3a.fasta" -done -done - -# INDEX CONSENSUS SEQUENCES -for sample in G3789.1 G3823 G3823_r1 G3789.3 -do -for directory in /idi/sabeti-data/kandersen/analysis/141017_danny-ebov -do -bsub -W 4:00 -q hour -R "rusage[mem=2]" -o $directory/_logs/$sample.log.bsub.txt -P sabeti_kga -J $sample.i1 "java -Xmx2g -jar /seq/software/picard/current/bin/CreateSequenceDictionary.jar R=$directory/_temp/$sample.s_segment3.fasta O=$directory/_temp/$sample.s_segment3.dict && /idi/sabeti-scratch/kandersen/bin/novocraft/novoindex $directory/_temp/$sample.s_segment3.nix $directory/_temp/$sample.s_segment3.fasta && samtools faidx $directory/_temp/$sample.s_segment3.fasta && java -Xmx2g -jar /seq/software/picard/current/bin/CreateSequenceDictionary.jar R=$directory/_temp/$sample.s_segment3a.fasta O=$directory/_temp/$sample.s_segment3a.dict && samtools faidx $directory/_temp/$sample.s_segment3a.fasta" -done -done - -# NOVOALIGN ALL READS TO CONSENSUS SEQUENCES -for sample in G3789.1 G3823 G3823_r1 G3789.3 -do -for directory in /idi/sabeti-data/kandersen/analysis/141017_danny-ebov -do -bsub -q week -R "rusage[mem=2]" -n 1 -R "span[hosts=1]" -o $directory/_logs/$sample.log.bsub.txt -P sabeti_align -J $sample.a1 "/idi/sabeti-scratch/kandersen/bin/novocraft_v3/novoalign -f $directory/_reads/$sample.reads1.fastq $directory/_reads/$sample.reads2.fastq -r Random -l 40 -g 40 -x 20 -t 100 -F STDFQ -d $directory/_temp/$sample.s_segment3.nix -o SAM $'@RG\tID:$sample\tSM:$sample\tPL:Illumina\tPU:HiSeq\tLB:BroadPE\tCN:Broad' | samtools view -buS -q 1 - | java -Xmx2g -jar /seq/software/picard/current/bin/SortSam.jar SO=coordinate I=/dev/stdin O=$directory/_temp/$sample.ref_mapped_s2.bam CREATE_INDEX=true VALIDATION_STRINGENCY=SILENT && java -Xmx2g -jar /humgen/gsa-hpprojects/GATK/bin/current/GenomeAnalysisTK.jar -T UnifiedGenotyper -R $directory/_temp/$sample.s_segment3a.fasta -I $directory/_temp/$sample.ref_mapped_s2.bam -o $directory/_temp/$sample.gatk_s2.vcf --baq OFF --useOriginalQualities -out_mode EMIT_ALL_SITES -dt NONE --num_threads 1 --min_base_quality_score 15 -ploidy 4 -stand_call_conf 0 -stand_emit_conf 0 -A AlleleBalance && python /home/unix/dpark/dev/dpark-scripts/viral.py vcf_to_fasta --trim_ends --min_coverage 2 $directory/_temp/$sample.gatk_s2.vcf $directory/_temp/$sample.s_segment4.fasta" -done -done - -#-------- PREPARE FINAL MAPPED ASSEMBLY USING NOVOALIGN --------# -# INDEX FINAL CONSENSUS SEQUENCES -for sample in G3789.1 G3823 G3823_r1 G3789.3 -do -for directory in /idi/sabeti-data/kandersen/analysis/141017_danny-ebov -do -bsub -W 4:00 -q hour -R "rusage[mem=2]" -o $directory/_logs/$sample.log.bsub.txt -P sabeti_kga -J $sample.ix "cat $directory/_temp/$sample.s_segment4.fasta > $directory/_refs/$sample.fasta && java -Xmx2g -jar /seq/software/picard/current/bin/CreateSequenceDictionary.jar R=$directory/_refs/$sample.fasta O=$directory/_refs/$sample.dict && /idi/sabeti-scratch/kandersen/bin/novocraft/novoindex $directory/_refs/$sample.nix $directory/_refs/$sample.fasta && samtools faidx $directory/_refs/$sample.fasta" -done -done - -# ALIGN ALL READS TO ITS OWN EBOV CONSENSUS -for sample in G3789.1 G3823 G3823_r1 G3789.3 -do -for directory in /idi/sabeti-data/kandersen/analysis/141017_danny-ebov -do -for date in 141018 -do -bsub -W 4:00 -q hour -R "rusage[mem=2]" -n 4 -R "span[hosts=1]" -o $directory/_logs/$sample.log.bsub.txt -P sabeti_align -J $sample.al "/idi/sabeti-scratch/kandersen/bin/novocraft_v3/novoalign -k -c 3 -f $directory/_reads/$sample.reads1.fastq $directory/_reads/$sample.reads2.fastq -r Random -l 40 -g 40 -x 20 -t 100 -F STDFQ -d $directory/_refs/$sample.nix -o SAM $'@RG\tID:$date.$sample\tSM:$sample\tPL:Illumina\tPU:HiSeq\tLB:BroadPE\tCN:Broad' 2> $directory/_logs/$sample.log.novoalign.txt | java -Xmx2g -jar /seq/software/picard/current/bin/SortSam.jar SO=coordinate I=/dev/stdin O=$directory/_bams/$sample.sorted.bam CREATE_INDEX=true && samtools view -b -q 1 -u $directory/_bams/$sample.sorted.bam | java -Xmx2g -jar /seq/software/picard/current/bin/SortSam.jar SO=coordinate I=/dev/stdin O=$directory/_temp/$sample.mapped.bam CREATE_INDEX=true VALIDATION_STRINGENCY=SILENT && java -Xmx2g -jar /seq/software/picard/current/bin/MarkDuplicates.jar I=$directory/_temp/$sample.mapped.bam O=$directory/_temp/$sample.mappedNoDub.bam METRICS_FILE=$directory/_temp/$sample.log.markdups.txt CREATE_INDEX=true REMOVE_DUPLICATES=true" -done -done -done - -#-------- PREPARE ALIGNMENT AND CALCULATE METRICS --------# -# RUN FASTQC REPORT -for sample in G3789.1 G3823 G3823_r1 G3789.3 -do -for directory in /idi/sabeti-data/kandersen/analysis/141017_danny-ebov -do -bsub -W 4:00 -q hour -R "rusage[mem=2]" -o $directory/_logs/$sample.log.bsub.txt -P sabeti_align -J $sample.dx "/idi/sabeti-scratch/kandersen/bin/fastqc/fastqc -f bam $directory/_bams/$sample.sorted.bam -o $directory/_reports/" -done -done - -# PERFORM LOCAL REALIGNMENT -for sample in G3789.1 G3823 G3823_r1 G3789.3 -do -for directory in /idi/sabeti-data/kandersen/analysis/141017_danny-ebov -do -bsub -W 4:00 -q hour -R "rusage[mem=2]" -o $directory/_logs/$sample.log.bsub.txt -P sabeti_align -J $sample.lr "java -Xmx2g -jar /humgen/gsa-hpprojects/GATK/bin/current/GenomeAnalysisTK.jar -T RealignerTargetCreator -R $directory/_refs/$sample.fasta -o $directory/_temp/$sample.log.realigner.intervals -I $directory/_temp/$sample.mappedNoDub.bam && java -Xmx2g -jar /humgen/gsa-hpprojects/GATK/bin/current/GenomeAnalysisTK.jar -T IndelRealigner -R $directory/_refs/$sample.fasta -targetIntervals $directory/_temp/$sample.log.realigner.intervals -I $directory/_temp/$sample.mappedNoDub.bam -o $directory/_bams/$sample.realigned.bam" -done -done - -# CALCULATE COVERAGE & SNPS -for sample in G3789.1 G3823 G3823_r1 G3789.3 -do -for directory in /idi/sabeti-data/kandersen/analysis/141017_danny-ebov -do -bsub -W 4:00 -q hour -R "rusage[mem=2]" -o $directory/_logs/$sample.log.bsub.txt -P sabeti_align -J $sample.st "/idi/sabeti-scratch/kandersen/bin/bedtools/genomeCoverageBed -d -ibam $directory/_bams/$sample.realigned.bam -g $directory/_refs/$sample.fasta > $directory/_pileup/$sample.coverage.txt && bamtools stats -insert -in $directory/_bams/$sample.realigned.bam > $directory/_logs/$sample.log.bamstats_realigned.txt && java -jar /humgen/gsa-hpprojects/GATK/bin/current/GenomeAnalysisTK.jar -T UnifiedGenotyper -R $directory/_refs/$sample.fasta -I $directory/_bams/$sample.realigned.bam -o $directory/_temp/$sample.snps.vcf --baq OFF --useOriginalQualities -out_mode EMIT_VARIANTS_ONLY -dt NONE --num_threads 1 --min_base_quality_score 20 -ploidy 10 -stand_call_conf 80.0 -stand_emit_conf 30.0 -A AlleleBalance" -done -done - -# FILTER SNPS -for sample in G3789.1 G3823 G3823_r1 G3789.3 -do -for directory in /idi/sabeti-data/kandersen/analysis/141017_danny-ebov -do -bsub -W 4:00 -q hour -R "rusage[mem=2]" -o $directory/_logs/$sample.log.bsub.txt -P sabeti_align -J $sample.gk2 java -jar /humgen/gsa-hpprojects/GATK/bin/current/GenomeAnalysisTK.jar -T VariantFiltration -R $directory/_refs/$sample.fasta -V $directory/_temp/$sample.snps.vcf -o $directory/_pileup/$sample.snps.vcf -l ERROR --filterExpression "(MQ0/DP) > 0.20 || BaseQRankSum < -10.0 || QD < 1.0 || MQRankSum < -4.0 || ReadPosRankSum < -4.0" --filterName LowConfidence --filterExpression "DP < 20 || (DP*(1-ABHet)) < 5" --filterName LowCoverage --filterExpression "ABHet > 0.95" --filterName LowFrequency --filterExpression "SB < -1.0e+09 || SB > 1.0e+09 || FS > 5" --filterName StrandBias -done -done - -#-------- SHORT META ANALYSIS @ BROAD --------# -# SUBSAMPLE READS FOR LATER RETRIEVAL -for sample in G3789.1 G3823 G3823_r1 G3789.3 -do -for directory in /idi/sabeti-data/kandersen/analysis/141017_danny-ebov -do -bsub -R "rusage[mem=6]" -W 4:00 -o $directory/_logs/$sample.log.bsub.txt -P sabeti_meta -J $sample.ss "python /idi/sabeti-scratch/kandersen/bin/scripts/subsampler.py -n 1000000 -mode p -in $directory/_reads/$sample.reads1.fastq $directory/_reads/$sample.reads2.fastq -out $directory/_temp/$sample.reads1.sub.fastq $directory/_temp/$sample.reads2.sub.fastq" -done -done - -# COUNT SPIKE-INS -for sample in G3789.1 G3823 G3823_r1 G3789.3 -do -for directory in /idi/sabeti-data/kandersen/analysis/141017_danny-ebov -do -bsub -W 4:00 -q hour -R "rusage[mem=2]" -o $directory/_logs/$sample.log.bsub.txt -P sabeti_align -J $sample.si "/idi/sabeti-scratch/kandersen/bin/novocraft_v3/novoalign -f $directory/_temp/$sample.reads1.sub.fastq $directory/_temp/$sample.reads2.sub.fastq -r Random -F STDFQ -d /idi/sabeti-scratch/kandersen/references/other/ercc_spike-ins.nix -o SAM 2> $directory/_temp/$sample.log.spike_novo.txt | java -Xmx2g -jar /seq/software/picard/current/bin/SortSam.jar SO=coordinate I=/dev/stdin O=$directory/_temp/$sample.spikes.bam CREATE_INDEX=true && samtools view -b -q 1 -u $directory/_temp/$sample.spikes.bam | java -Xmx2g -jar /seq/software/picard/current/bin/SortSam.jar SO=coordinate I=/dev/stdin O=$directory/_temp/$sample.spikes_mapped.bam CREATE_INDEX=true VALIDATION_STRINGENCY=SILENT && /idi/sabeti-scratch/kandersen/bin/scripts/CountAlignmentsByDescriptionLine -bam $directory/_temp/$sample.spikes_mapped.bam > $directory/_logs/$sample.log.spike_count.txt" -done -done - -# ALIGN TO HUMAN OR OTHER REFERENCE USING BWA -for sample in G3789.1 G3823 G3823_r1 G3789.3 -do -for directory in /idi/sabeti-data/kandersen/analysis/141017_danny-ebov -do -bsub -R "rusage[mem=8]" -W 4:00 -o $directory/_logs/$sample.log.bsub.txt -P sabeti_align -J $sample.hg "perl /idi/sabeti-scratch/kandersen/bin/prinseq/prinseq-lite.pl -lc_method dust -lc_threshold 7 -line_width 0 -fastq $directory/_temp/$sample.reads1.sub.fastq -out_good stdout -out_bad null | /idi/sabeti-scratch/kandersen/bin/bwa/bwa aln -q 5 -l 32 /idi/sabeti-scratch/kandersen/references/human/hg19 - | /idi/sabeti-scratch/kandersen/bin/bwa/bwa samse /idi/sabeti-scratch/kandersen/references/human/hg19 - $directory/_temp/$sample.reads1.sub.fastq | java -Xmx2g -jar /seq/software/picard/current/bin/SortSam.jar SO=coordinate I=/dev/stdin O=/dev/stdout VALIDATION_STRINGENCY=SILENT | samtools flagstat - > $directory/_logs/$sample.log.hg19.txt" -done -done - -# PERFORM BLASTN ANALYSIS -for sample in G3789.1 G3823 G3823_r1 G3789.3 -do -for directory in /idi/sabeti-data/kandersen/analysis/141017_danny-ebov -do -bsub -W 4:00 -R "rusage[mem=12]" -o $directory/_logs/$sample.log.bsub.txt -P sabeti_align -J $sample.fa "/idi/sabeti-scratch/kandersen/bin/prinseq/prinseq-lite.pl -min_len 40 -seq_num 10000 -out_format 1 -line_width 0 -fastq $directory/_temp/$sample.reads1.sub.fastq -out_good $directory/_temp/$sample -out_bad null && blastn -db /idi/sabeti-scratch/kandersen/references/blast/nt -outfmt 6 -evalue 1e-10 -num_descriptions 30 -query $directory/_temp/$sample.fasta -out $directory/_temp/$sample.blastn.txt" -done -done - -# CREATE MEGAN FILES -# You will have to be running an xhost in order to do this step: open /Applications/Utilities/X11.app/ xhost + and export DISPLAY=:0 -for sample in G3789.1 G3823 G3823_r1 G3789.3 -do -for directory in /idi/sabeti-data/kandersen/analysis/141017_danny-ebov -do -bsub -R "rusage[mem=4]" -W 4:00 -o $directory/_logs/$sample.log.bsub.txt -P sabeti_align -J $sample.mg /idi/sabeti-scratch/kandersen/bin/megan/MEGAN -E +g -p "/idi/sabeti-scratch/kandersen/bin/megan/Megan.def" -x "load gi2taxfile="/idi/sabeti-scratch/kandersen/bin/megan/class/resources/files/gi_taxid_nucl.bin"; import blastfile="$directory/_temp/$sample.blastn.txt" fastafile="$directory/_temp/$sample.fasta" meganfile="$directory/_meta/$sample.rma" useseed=false usekegg=false; select nodes=all; uncollapse subtrees; update; exportimage file="$directory/_meta/$sample.pdf" format=pdf replace=true; quit;" -done -done From 7093626b08d7dfe81571133a100b71a0fab810f5 Mon Sep 17 00:00:00 2001 From: Daniel Park Date: Sun, 18 Jan 2015 15:14:56 -0500 Subject: [PATCH 02/12] compile note on mosaik [ci skip] --- docs/install.rst | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/docs/install.rst b/docs/install.rst index f8fabed00..fdaee3d9e 100644 --- a/docs/install.rst +++ b/docs/install.rst @@ -70,3 +70,7 @@ Alternatively, if you are using the Snakemake pipelines, you can create a dictionary called "env_vars" in the config.json file for Snakemake, and the pipelines will automatically set all environment variables prior to running any scripts. + +The version of MOSAIK we use seems to fail compile on GCC-4.9 but compiles +fine on GCC-4.4. We have not tried intermediate versions of GCC, nor the +latest versions of MOSAIK. From 5f33e7b5bae44f2c705c8286cd84a6d41aaa0e0e Mon Sep 17 00:00:00 2001 From: Daniel Park Date: Sun, 18 Jan 2015 17:26:01 -0500 Subject: [PATCH 03/12] push more steps into python --- assembly.py | 21 +++++----- pipes/rules/assembly.rules | 14 ++----- read_utils.py | 84 ++++++++++++++++++++++++++++++-------- test/test_read_utils.py | 2 +- tools/picard.py | 7 ++++ tools/samtools.py | 9 ++-- 6 files changed, 96 insertions(+), 41 deletions(-) diff --git a/assembly.py b/assembly.py index b6dfb6219..079ab01ed 100755 --- a/assembly.py +++ b/assembly.py @@ -17,8 +17,9 @@ log = logging.getLogger(__name__) -def trim_rmdup_subsamp_reads(inBam, clipDb, outBam, n_reads=100000, outFastqs=None): +def trim_rmdup_subsamp_reads(inBam, clipDb, outBam, n_reads=100000): ''' Take reads through Trimmomatic, Prinseq, and subsampling. + This should probably move over to read_utils or taxon_filter. ''' # BAM -> fastq @@ -57,6 +58,9 @@ def trim_rmdup_subsamp_reads(inBam, clipDb, outBam, n_reads=100000, outFastqs=No os.unlink(purgefq[1]) # Fastq -> BAM + # Note: this destroys RG IDs! We should instead just pull the list + # of IDs out of purge_unmated, random.sample them ourselves, and + # use FilterSamReadsTool to go straight from inBam -> outBam tmp_bam = util.file.mkstempfname('.subsamp.bam') tmp_header = util.file.mkstempfname('.header.sam') tools.picard.FastqToSamTool().execute( @@ -65,11 +69,6 @@ def trim_rmdup_subsamp_reads(inBam, clipDb, outBam, n_reads=100000, outFastqs=No tools.samtools.SamtoolsTool().reheader(tmp_bam, tmp_header, outBam) os.unlink(tmp_bam) os.unlink(tmp_header) - - # Save fastqs if requested - if outFastqs: - shutil.copyfile(subsampfq[0], outFastqs[0]) - shutil.copyfile(subsampfq[1], outFastqs[1]) os.unlink(subsampfq[0]) os.unlink(subsampfq[1]) @@ -83,14 +82,16 @@ def assemble_trinity(inBam, outFasta, clipDb, n_reads=100000, outReads=None): subsamp_bam = outReads else: subsamp_bam = util.file.mkstempfname('.subsamp.bam') + + trim_rmdup_subsamp_reads(inBam, clipDb, subsamp_bam, n_reads=n_reads) subsampfq = list(map(util.file.mkstempfname, ['.subsamp.1.fastq', '.subsamp.2.fastq'])) - trim_rmdup_subsamp_reads(inBam, clipDb, subsamp_bam, - n_reads=n_reads, outFastqs=subsampfq) + tools.picard.SamToFastqTool().execute(subsamp_bam, subsampfq[0], subsampfq[1]) tools.trinity.TrinityTool().execute(subsampfq[0], subsampfq[1], outFasta) - if not outReads: - os.unlink(subsamp_bam) os.unlink(subsampfq[0]) os.unlink(subsampfq[1]) + + if not outReads: + os.unlink(subsamp_bam) def parser_assemble_trinity(parser=argparse.ArgumentParser()): parser.add_argument('inBam', diff --git a/pipes/rules/assembly.rules b/pipes/rules/assembly.rules index 0cd246f71..beba6f699 100644 --- a/pipes/rules/assembly.rules +++ b/pipes/rules/assembly.rules @@ -141,20 +141,14 @@ rule map_reads_to_self: {sample}.rmdup.bam - only mapped reads, with duplicates removed by Picard {sample}.realigned.bam - mapped/rmdup reads, realigned with GATK IndelRealigner ''' - input: config["dataDir"]+'/'+config["subdirs"]["assembly"]+'/{sample}.fasta', - config["dataDir"]+'/'+config["subdirs"]["per_sample"]+'/{sample}.cleaned.bam' + input: config["dataDir"]+'/'+config["subdirs"]["per_sample"]+'/{sample}.cleaned.bam', + config["dataDir"]+'/'+config["subdirs"]["assembly"]+'/{sample}.fasta' output: config["dataDir"]+'/'+config["subdirs"]["align_self"]+'/{sample}.bam', - config["dataDir"]+'/'+config["subdirs"]["align_self"]+'/{sample}.mapped.bam', - config["dataDir"]+'/'+config["subdirs"]["align_self"]+'/{sample}.rmdup.bam', - config["dataDir"]+'/'+config["subdirs"]["align_self"]+'/{sample}.realigned.bam' + config["dataDir"]+'/'+config["subdirs"]["align_self"]+'/{sample}.mapped.bam' resources: mem=4 params: LSF=config.get('LSF_queues', {}).get('short', '-W 4:00'), logid="{sample}", novoalign_options = "-r Random -l 40 -g 40 -x 20 -t 100 -k -c 3" run: makedirs(os.path.join(config["dataDir"], config["subdirs"]["align_self"])) - shell("{config[binDir]}/read_utils.py novoalign {input[1]} {input[0]} {output[0]} --options '{params.novoalign_options}'") - shell("{config[binDir]}/read_utils.py filter_bam_mapped_only {output[0]} {output[1]}") - shell("{config[binDir]}/read_utils.py mkdup_picard {output[1]} {output[2]} --remove --picardOptions CREATE_INDEX=true") - shell("{config[binDir]}/read_utils.py gatk_realign {output[2]} {input[0]} {output[3]}") - + shell("{config[binDir]}/read_utils.py align_and_fix {input} --outBamAll {output[0]} --outBamFiltered {output[1]} --novoalign_options '{params.novoalign_options}'") diff --git a/read_utils.py b/read_utils.py index 24636161a..5fc870cfd 100755 --- a/read_utils.py +++ b/read_utils.py @@ -459,7 +459,7 @@ def split_bam(inBam, outBams) : # dump to bigsam bigsam = mkstempfname('.sam') - samtools.execute('view', [inBam], stdout=bigsam) + samtools.view([], inBam, bigsam) # split bigsam into little ones with util.file.open_or_gzopen(bigsam, 'rt') as inf: @@ -648,28 +648,20 @@ def main_rmdup_prinseq_fastq(args): __commands__.append(('rmdup_prinseq_fastq', parser_rmdup_prinseq_fastq)) -def filter_bam_mapped_only(inBam, outBam, JVMmemory=None): - '''Samtools and Picard to reduce a BAM file to aligned reads only.''' - # filter to aligned-only with Samtools - tmp_bam = util.file.mkstempfname('.bam') - cmd = [tools.samtools.SamtoolsTool().install_and_get_path(), - 'view', '-b', '-1', '-q', '1', inBam] - log.debug(' '.join(cmd) +' > '+ tmp_bam) - with open(tmp_bam, 'wb') as outf: - subprocess.check_call(cmd, stdout=outf) - # fix headers and create index with Picard - tools.picard.SortSamTool().execute(tmp_bam, outBam, sort_order='coordinate', - picardOptions=['CREATE_INDEX=true', 'VALIDATION_STRINGENCY=SILENT'], - JVMmemory=JVMmemory) - os.unlink(tmp_bam) +def filter_bam_mapped_only(inBam, outBam): + ''' Samtools to reduce a BAM file to only reads that are + aligned (-F 4) with a non-zero mapping quality (-q 1) + and are not marked as a PCR/optical duplicate (-F 1024). + ''' + tools.samtools.SamtoolsTool().view( + ['-b', '-q', '1', '-F', '1028'], inBam, outBam) + tools.picard.BuildBamIndexTool().execute(outBam) return 0 def parser_filter_bam_mapped_only(parser=argparse.ArgumentParser()): parser.add_argument('inBam', help='Input aligned reads, BAM format.') parser.add_argument('outBam', help='Output sorted indexed reads, filtered to aligned-only, BAM format.') - parser.add_argument('--JVMmemory', default = tools.picard.SortSamTool.jvmMemDefault, - help='JVM virtual memory size (default: %(default)s)') util.cmd.common_args(parser, (('loglevel', None), ('version', None), ('tmpDir', None))) util.cmd.attach_main(parser, filter_bam_mapped_only, split_args=True) return parser @@ -751,6 +743,64 @@ def main_gatk_realign(args): __commands__.append(('gatk_realign', parser_gatk_realign)) +# ========================= + +def align_and_fix(inBam, refFasta, outBamAll=None, outBamFiltered=None, + novoalign_options='', JVMmemory=None): + ''' Take reads, align to reference with Novoalign, mark duplicates + with Picard, realign indels with GATK, and optionally filter + final file to mapped/non-dupe reads. + ''' + if not (outBamAll or outBamFiltered): + log.warn("are you sure you meant to do nothing?") + return + + bam_aligned = mkstempfname('.aligned.bam') + tools.novoalign.NovoalignTool().execute( + inBam, refFasta, bam_aligned, + options=novoalign_options.split(), JVMmemory=JVMmemory) + + bam_mkdup = mkstempfname('.mkdup.bam') + tools.picard.MarkDuplicatesTool().execute( + [bam_aligned], bam_mkdup, JVMmemory=JVMmemory) + os.unlink(bam_aligned) + + bam_realigned = mkstempfname('.realigned.bam') + tools.gatk.GATKTool().local_realign( + bam_mkdup, refFasta, bam_realigned, JVMmemory=JVMmemory) + os.unlink(bam_mkdup) + + if outBamAll: + shutil.copyfile(bam_realigned, outBamAll) + tools.picard.BuildBamIndexTool().execute(outBamAll) + if outBamFiltered: + tools.samtools.SamtoolsTool().view( + ['-b', '-q', '1', '-F', '1028'], + bam_realigned, outBamFiltered) + tools.picard.BuildBamIndexTool().execute(outBamFiltered) + os.unlink(bam_realigned) + + +def parser_align_and_fix(parser=argparse.ArgumentParser()): + parser.add_argument('inBam', + help='Input unaligned reads, BAM format.') + parser.add_argument('refFasta', + help='Reference genome, FASTA format, pre-indexed by Picard and Novoalign.') + parser.add_argument('--outBamAll', default = None, + help='''Aligned, sorted, and indexed reads. Unmapped reads are + retained and duplicate reads are marked, not removed.''') + parser.add_argument('--outBamFiltered', default = None, + help='''Aligned, sorted, and indexed reads. Unmapped reads and + duplicate reads are removed from this file.''') + parser.add_argument('--novoalign_options', default = '-r Random', + help='Novoalign options (default: %(default)s)') + parser.add_argument('--JVMmemory', default = '4g', + help='JVM virtual memory size (default: %(default)s)') + util.cmd.common_args(parser, (('loglevel', None), ('version', None), ('tmpDir', None))) + util.cmd.attach_main(parser, align_and_fix, split_args=True) + return parser +__commands__.append(('align_and_fix', parser_align_and_fix)) + # ========================= def full_parser(): diff --git a/test/test_read_utils.py b/test/test_read_utils.py index af9948bfe..91b73a048 100644 --- a/test/test_read_utils.py +++ b/test/test_read_utils.py @@ -92,7 +92,7 @@ def test_fastq_bam(self) : # samtools view for out.sam and compare to expected samtools = tools.samtools.SamtoolsTool() - samtools.execute('view', ['-h', outBamCmd], stdout=outSam) + samtools.view(['-h'], outBamCmd, outSam) # picard.sam.FastqToSam outputs header fields in different order for # java version 1.8 vs 1.7/1.6, so compare both self.assertTrue(filecmp.cmp(outSam, expected1_7Sam, shallow=False) or diff --git a/tools/picard.py b/tools/picard.py index 7ec47bdc5..b84354bad 100644 --- a/tools/picard.py +++ b/tools/picard.py @@ -146,6 +146,13 @@ def execute(self, inFasta, outDict=None, overwrite=False, opts = ['REFERENCE='+inFasta, 'OUTPUT='+outDict] PicardTools.execute(self, self.subtoolName, opts + picardOptions, JVMmemory) +class BuildBamIndexTool(PicardTools) : + subtoolName = 'BuildBamIndex' + jvmMemDefault = '512m' + def execute(self, inBam, picardOptions=[], JVMmemory=None) : + opts = ['INPUT='+inBam] + PicardTools.execute(self, self.subtoolName, opts + picardOptions, JVMmemory) + class ExtractIlluminaBarcodesTool(PicardTools) : subtoolName = 'ExtractIlluminaBarcodes' jvmMemDefault = '8g' diff --git a/tools/samtools.py b/tools/samtools.py index 225d3325d..df35f0062 100644 --- a/tools/samtools.py +++ b/tools/samtools.py @@ -45,6 +45,9 @@ def execute(self, command, args, stdin=None, stdout=None): stdin.close() if stdout: stdout.close() + + def view(self, args, inFile, outFile, regions=[]): + self.execute('view', args + ['-o', outFile, inFile] + regions) def faidx(self, inFasta, overwrite=False): ''' Index reference genome for samtools ''' @@ -62,11 +65,11 @@ def reheader(self, inBam, headerFile, outBam): def dumpHeader(self, inBam, outHeader) : if inBam.endswith('.bam'): - opts = ['-H', inBam] + opts = ['-H'] elif inBam.endswith('.sam'): - opts = ['-H', '-S', inBam] + opts = ['-H', '-S'] #header = pysam.view(*opts) - self.execute('view', opts, stdout=outHeader) + self.view(opts, inBam, outHeader) def getHeader(self, inBam): tmpf = util.file.mkstempfname('.txt') From 1aa10e58c9a97afcaf599224e83869b440869f93 Mon Sep 17 00:00:00 2001 From: Daniel Park Date: Mon, 19 Jan 2015 15:56:45 -0500 Subject: [PATCH 04/12] bugfix: forgot to index mkdup output --- read_utils.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/read_utils.py b/read_utils.py index 5fc870cfd..f50b54022 100755 --- a/read_utils.py +++ b/read_utils.py @@ -762,7 +762,8 @@ def align_and_fix(inBam, refFasta, outBamAll=None, outBamFiltered=None, bam_mkdup = mkstempfname('.mkdup.bam') tools.picard.MarkDuplicatesTool().execute( - [bam_aligned], bam_mkdup, JVMmemory=JVMmemory) + [bam_aligned], bam_mkdup, + picardOptions=['CREATE_INDEX=true'], JVMmemory=JVMmemory) os.unlink(bam_aligned) bam_realigned = mkstempfname('.realigned.bam') From 70ada6db3815b640b7a55b618d0a2b72b3142e20 Mon Sep 17 00:00:00 2001 From: Daniel Park Date: Mon, 19 Jan 2015 16:10:44 -0500 Subject: [PATCH 05/12] placeholder for more documentation [ci skip] --- docs/description.rst | 33 +++++++++++++++++++++++++++++++++ docs/index.rst | 1 + 2 files changed, 34 insertions(+) create mode 100644 docs/description.rst diff --git a/docs/description.rst b/docs/description.rst new file mode 100644 index 000000000..2c5305ee1 --- /dev/null +++ b/docs/description.rst @@ -0,0 +1,33 @@ +Description of the methods +========================== + +Much more documentation to come... + +TO DO: here we will put a high level description of the various tools that +exist here, perhaps with some pictures and such. We will describe why we +used certain tools and approaches / how other approaches fell short / what +kinds of problems certain steps are trying to solve. Perhaps some links to +papers and such. Kind of a mini-methods paper here. + + +Viral genome analysis +--------------------- + +*De novo* assembly, reference assisted assembly improvements, +gene annotaion, species-level variation, within-host variation, etc. + + +Taxonomic read filtration +------------------------- + +Especially human read depletion (prior to submission to NCBI SRA). +But also the part where we restrict to a particular taxa of interest +(the species you're studying). + + +Taxonomic read identification +----------------------------- + +Nothing much here at the moment. That comes later, but we will later +integrate it when it's ready. + diff --git a/docs/index.rst b/docs/index.rst index 363ad264a..ffe89dee3 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -14,6 +14,7 @@ Contents :maxdepth: 2 :numbered: + description install cmdline pipeuse From 0dfcaf68979829abb75d9271dd630ac76b1f4519 Mon Sep 17 00:00:00 2001 From: Daniel Park Date: Mon, 19 Jan 2015 19:52:28 -0500 Subject: [PATCH 06/12] show github version in sphinx docs footer [ci skip] --- .gitignore | 2 ++ docs/_templates/layout.html | 27 +++++++++++++++++++++++++++ docs/conf.py | 18 +++++++++++++----- 3 files changed, 42 insertions(+), 5 deletions(-) create mode 100644 docs/_templates/layout.html diff --git a/.gitignore b/.gitignore index d67bbced2..8b866c87b 100644 --- a/.gitignore +++ b/.gitignore @@ -1,5 +1,7 @@ VERSION test/output/ + +# Sphinx documentation docs/_build/ # Mac OSX diff --git a/docs/_templates/layout.html b/docs/_templates/layout.html new file mode 100644 index 000000000..3b353695b --- /dev/null +++ b/docs/_templates/layout.html @@ -0,0 +1,27 @@ +{% extends '!layout.html' %} + +{% block footer %} + +
+ +
+

Code version: {{ release }}

+ +

+ {%- if show_copyright %} + {%- if hasdoc('copyright') %} + {% trans path=pathto('copyright'), copyright=copyright|e %}© Copyright {{ copyright }}.{% endtrans %} + {%- else %} + {% trans copyright=copyright|e %}© Copyright {{ copyright }}.{% endtrans %} + {%- endif %} + {%- endif %} + + {%- if last_updated %} + {% trans last_updated=last_updated|e %}Last updated on {{ last_updated }}.{% endtrans %} + {%- endif %} +

+
+ + {% trans %}Built with Sphinx using a theme provided by Read the Docs{% endtrans %}. + +{% endblock %} diff --git a/docs/conf.py b/docs/conf.py index c5f21604b..4a9036143 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -21,14 +21,22 @@ # documentation root, use os.path.abspath to make it absolute, like shown here. sys.path.insert(0, os.path.dirname(os.path.abspath('.'))) -import util.version - # -- Mock out the heavyweight pip packages, esp those that require C ---- import mock MOCK_MODULES = ['numpy', 'scipy', 'matplotlib', 'pysam', 'Bio', 'Bio.AlignIO', 'Bio.SeqIO', 'Bio.Data.IUPACData'] for mod_name in MOCK_MODULES: sys.modules[mod_name] = mock.Mock() + +# -- Obtain GIT version -- +import subprocess +def _git_version(): + cmd = ['git', 'describe', '--tags', '--always'] # omit "--dirty" from doc build + out = subprocess.check_output(cmd) + if type(out) != str: + out = out.decode('utf-8') + __version__ = out.strip() +_git_version() # -- General configuration ------------------------------------------------ @@ -65,7 +73,7 @@ # |version| and |release|, also used in various other places throughout the # built documents. # -release = util.version.get_version() +release = __version__ version = '.'.join(release.split('.')[:2]) # The language for content autogenerated by Sphinx. Refer to documentation @@ -76,7 +84,7 @@ # non-false value, then it is used: #today = '' # Else, today_fmt is used as the format for a strftime call. -#today_fmt = '%B %d, %Y' +today_fmt = '%Y-%m-%d' # List of patterns, relative to source directory, that match files and # directories to ignore when looking for source files. @@ -149,7 +157,7 @@ # If not '', a 'Last updated on:' timestamp is inserted at every page bottom, # using the given strftime format. -#html_last_updated_fmt = '%b %d, %Y' +html_last_updated_fmt = '%Y-%m-%d' # If true, SmartyPants will be used to convert quotes and dashes to # typographically correct entities. From ebc0721920eec226b0e631e2cba35f3250ace64f Mon Sep 17 00:00:00 2001 From: Daniel Park Date: Mon, 19 Jan 2015 19:53:49 -0500 Subject: [PATCH 07/12] fix git versioning in sphinx conf.py [ci skip] --- docs/conf.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/conf.py b/docs/conf.py index 4a9036143..92bcec419 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -35,8 +35,8 @@ def _git_version(): out = subprocess.check_output(cmd) if type(out) != str: out = out.decode('utf-8') - __version__ = out.strip() -_git_version() + return out.strip() +__version__ = _git_version() # -- General configuration ------------------------------------------------ From bbaa6d6ce52c45935fc11891ebeed856b889799d Mon Sep 17 00:00:00 2001 From: Daniel Park Date: Mon, 19 Jan 2015 20:16:04 -0500 Subject: [PATCH 08/12] simplify footer override [ci skip] --- docs/_templates/layout.html | 25 ++----------------------- 1 file changed, 2 insertions(+), 23 deletions(-) diff --git a/docs/_templates/layout.html b/docs/_templates/layout.html index 3b353695b..be1a9719c 100644 --- a/docs/_templates/layout.html +++ b/docs/_templates/layout.html @@ -1,27 +1,6 @@ {% extends '!layout.html' %} {% block footer %} - -
- -
-

Code version: {{ release }}

- -

- {%- if show_copyright %} - {%- if hasdoc('copyright') %} - {% trans path=pathto('copyright'), copyright=copyright|e %}© Copyright {{ copyright }}.{% endtrans %} - {%- else %} - {% trans copyright=copyright|e %}© Copyright {{ copyright }}.{% endtrans %} - {%- endif %} - {%- endif %} - - {%- if last_updated %} - {% trans last_updated=last_updated|e %}Last updated on {{ last_updated }}.{% endtrans %} - {%- endif %} -

-
- - {% trans %}Built with Sphinx using a theme provided by Read the Docs{% endtrans %}. - +{{ super() }} +

Code version: {{ release }}

{% endblock %} From cfc70ebacf1cbfb318e3b76fa5094b4ef8efe1e1 Mon Sep 17 00:00:00 2001 From: Daniel Park Date: Mon, 19 Jan 2015 20:32:45 -0500 Subject: [PATCH 09/12] can we inject arbitrary text to the doc footer? [ci skip] --- docs/_templates/layout.html | 6 ------ docs/conf.py | 2 +- 2 files changed, 1 insertion(+), 7 deletions(-) delete mode 100644 docs/_templates/layout.html diff --git a/docs/_templates/layout.html b/docs/_templates/layout.html deleted file mode 100644 index be1a9719c..000000000 --- a/docs/_templates/layout.html +++ /dev/null @@ -1,6 +0,0 @@ -{% extends '!layout.html' %} - -{% block footer %} -{{ super() }} -

Code version: {{ release }}

-{% endblock %} diff --git a/docs/conf.py b/docs/conf.py index 92bcec419..f843b4614 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -157,7 +157,7 @@ def _git_version(): # If not '', a 'Last updated on:' timestamp is inserted at every page bottom, # using the given strftime format. -html_last_updated_fmt = '%Y-%m-%d' +html_last_updated_fmt = '%Y-%m-%d. Code version {}.'.format(release) # If true, SmartyPants will be used to convert quotes and dashes to # typographically correct entities. From 0bff064aec5f244211e80c13ce26172b2f973f0f Mon Sep 17 00:00:00 2001 From: Daniel Park Date: Mon, 19 Jan 2015 20:33:11 -0500 Subject: [PATCH 10/12] apparently, valid rST means that the section underline must match the length of the section name [ci skip] --- docs/broad_utils.rst | 2 +- docs/interhost.rst | 2 +- docs/read_utils.rst | 2 +- docs/reports.rst | 2 +- docs/taxon_filter.rst | 2 +- 5 files changed, 5 insertions(+), 5 deletions(-) diff --git a/docs/broad_utils.rst b/docs/broad_utils.rst index f33d24320..f880c26b8 100644 --- a/docs/broad_utils.rst +++ b/docs/broad_utils.rst @@ -1,5 +1,5 @@ broad_utils.py - for data generated at the Broad Institute -===================================== +========================================================== .. argparse:: :module: broad_utils diff --git a/docs/interhost.rst b/docs/interhost.rst index 2c6c7e052..6b076c5e1 100644 --- a/docs/interhost.rst +++ b/docs/interhost.rst @@ -1,5 +1,5 @@ interhost.py - species and population-level genetic variation -===================================== +============================================================= .. argparse:: :module: interhost diff --git a/docs/read_utils.rst b/docs/read_utils.rst index 2d46eee99..57b5f3ee3 100644 --- a/docs/read_utils.rst +++ b/docs/read_utils.rst @@ -1,5 +1,5 @@ read_utils.py - utilities that manipulate bam and fastq files -===================================== +============================================================= .. argparse:: :module: read_utils diff --git a/docs/reports.rst b/docs/reports.rst index 46d61aca4..e5afd7df9 100644 --- a/docs/reports.rst +++ b/docs/reports.rst @@ -1,5 +1,5 @@ reports.py - produce various metrics and reports -===================================== +================================================ .. argparse:: :module: reports diff --git a/docs/taxon_filter.rst b/docs/taxon_filter.rst index e185e0dea..fe7a60e0b 100644 --- a/docs/taxon_filter.rst +++ b/docs/taxon_filter.rst @@ -1,5 +1,5 @@ taxon_filter.py - tools for taxonomic removal or filtration of reads -===================================== +==================================================================== .. argparse:: :module: taxon_filter From aceea7513605c41b983b214375e1011c2962b723 Mon Sep 17 00:00:00 2001 From: Daniel Park Date: Mon, 19 Jan 2015 20:35:51 -0500 Subject: [PATCH 11/12] footer text injection works, just clean it up a bit [ci skip] --- docs/conf.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/conf.py b/docs/conf.py index f843b4614..58c0bd012 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -157,7 +157,7 @@ def _git_version(): # If not '', a 'Last updated on:' timestamp is inserted at every page bottom, # using the given strftime format. -html_last_updated_fmt = '%Y-%m-%d. Code version {}.'.format(release) +html_last_updated_fmt = '%Y-%m-%d.
Code version {}'.format(release) # If true, SmartyPants will be used to convert quotes and dashes to # typographically correct entities. From 43e5368b0c65fe343025573fad635a2b2eede757 Mon Sep 17 00:00:00 2001 From: Daniel Park Date: Mon, 19 Jan 2015 20:38:33 -0500 Subject: [PATCH 12/12] clean up RTD footer [ci skip] --- docs/conf.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/conf.py b/docs/conf.py index 58c0bd012..893fc5267 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -157,7 +157,7 @@ def _git_version(): # If not '', a 'Last updated on:' timestamp is inserted at every page bottom, # using the given strftime format. -html_last_updated_fmt = '%Y-%m-%d.
Code version {}'.format(release) +html_last_updated_fmt = '%Y-%m-%d. {}'.format(release) # If true, SmartyPants will be used to convert quotes and dashes to # typographically correct entities.