Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

EVA-3308 Support multiple taxonomy ids when remapping #10

Merged
merged 8 commits into from
Jan 15, 2024
Merged
Show file tree
Hide file tree
Changes from 7 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 0 additions & 3 deletions bin/add_target_assembly.py
Original file line number Diff line number Diff line change
Expand Up @@ -40,9 +40,6 @@ def main():

load_config()

if not args.taxonomy or not args.target_assembly or not args.release_version:
raise ArgumentError(None, 'Must provide --taxonomy, --target_assembly, and --release_version')

job = AssemblyIngestionJob(args.taxonomy, args.target_assembly, args.release_version)
logging_config.add_stdout_handler()

Expand Down
204 changes: 118 additions & 86 deletions eva_assembly_ingestion/assembly_ingestion_job.py

Large diffs are not rendered by default.

20 changes: 13 additions & 7 deletions eva_assembly_ingestion/nextflow/remap_cluster.nf
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@ def helpMessage() {
Remap one assembly version to another, cluster, and QC.

Inputs:
--taxonomy_id taxonomy id of submitted variants that needs to be remapped.
--taxonomy_list list of taxonomy id of submitted variants that needs to be remapped.
--source_assembly_accession assembly accession of the submitted variants are currently mapped to.
--target_assembly_accession assembly accession the submitted variants will be remapped to.
--species_name scientific name to be used for the species.
Expand All @@ -35,8 +35,8 @@ params.help = null
if (params.help) exit 0, helpMessage()

// Test input files
if (!params.taxonomy_id || !params.source_assembly_accession || !params.target_assembly_accession || !params.species_name || !params.genome_assembly_dir ) {
if (!params.taxonomy_id) log.warn('Provide the taxonomy id of the source submitted variants using --taxonomy_id')
if (!params.taxonomy_list || !params.source_assembly_accession || !params.target_assembly_accession || !params.species_name || !params.genome_assembly_dir ) {
if (!params.taxonomy_list) log.warn('Provide the taxonomy id of the source submitted variants using --taxonomy_list')
if (!params.source_assembly_accession) log.warn('Provide the source assembly using --source_assembly_accession')
if (!params.target_assembly_accession) log.warn('Provide the target assembly using --target_assembly_accession')
if (!params.species_name) log.warn('Provide a species name using --species_name')
Expand Down Expand Up @@ -123,7 +123,7 @@ process update_target_genome {


/*
* Extract the submitted variants to remap from the accesioning warehouse and store them in a VCF file.
* Extract the submitted variants to remap from the accessioning warehouse and store them in a VCF file.
*/
process extract_vcf_from_mongo {
memory "${params.memory}GB"
Expand All @@ -132,11 +132,12 @@ process extract_vcf_from_mongo {
input:
path source_fasta
path source_report
each taxonomy

output:
// Store both vcfs (eva and dbsnp), emit: one channel
path '*.vcf', emit: source_vcfs
path "${params.source_assembly_accession}_vcf_extractor.log", emit: log_filename
path "${params.source_assembly_accession}_${taxonomy}_vcf_extractor.log", emit: log_filename

publishDir "$params.output_dir/logs", overwrite: true, mode: "copy", pattern: "*.log*"

Expand All @@ -145,7 +146,8 @@ process extract_vcf_from_mongo {
--spring.config.location=file:${params.extraction_properties} \
--parameters.fasta=${source_fasta} \
--parameters.assemblyReportUrl=file:${source_report} \
> ${params.source_assembly_accession}_vcf_extractor.log
--parameters.taxonomy=${taxonomy}
> ${params.source_assembly_accession}_${taxonomy}_vcf_extractor.log
"""
}

Expand Down Expand Up @@ -330,7 +332,11 @@ workflow {
update_source_genome(params.source_assembly_accession, retrieve_source_genome.out.source_fasta,
retrieve_source_genome.out.source_report, params.remapping_config)
update_target_genome(retrieve_target_genome.out.target_fasta, retrieve_target_genome.out.target_report, params.remapping_config)
extract_vcf_from_mongo(update_source_genome.out.updated_source_fasta, update_source_genome.out.updated_source_report)
extract_vcf_from_mongo(
update_source_genome.out.updated_source_fasta,
update_source_genome.out.updated_source_report,
params.taxonomy_list
)
remap_variants(extract_vcf_from_mongo.out.source_vcfs.flatten(), update_source_genome.out.updated_source_fasta,
update_target_genome.out.updated_target_fasta)
ingest_vcf_into_mongo(remap_variants.out.remapped_vcfs, update_target_genome.out.updated_target_report)
Expand Down
15 changes: 11 additions & 4 deletions tests/nextflow-tests/java/FakeExtractionPipeline.java
Original file line number Diff line number Diff line change
Expand Up @@ -6,14 +6,21 @@ public class FakeExtractionPipeline {

public static void main(String[] args) {
String outString = "java -jar extraction.jar";
String inFile = null;
String accession = null;
String taxonomy = null;
for (String arg: args) {
outString += " " + arg;
if (arg.startsWith("--parameters.fasta="))
inFile = arg.substring("--parameters.fasta=".length(), arg.length()-"_custom.fa".length());
if (arg.startsWith("--parameters.fasta=")){
accession = arg.substring("--parameters.fasta=".length(), arg.length()-"_custom.fa".length());
}
if (arg.startsWith("--parameters.taxonomy=")){
taxonomy = arg.substring("--parameters.taxonomy=".length(), arg.length());
}
}
System.out.println(outString);
System.out.println(inFile);
System.out.println(accession);
System.out.println(taxonomy);
String inFile = accession + "_" + taxonomy;

// real pipeline gets this from properties
String outFile1 = inFile + "_dbsnp.vcf";
Expand Down
25 changes: 15 additions & 10 deletions tests/nextflow-tests/run_tests.sh
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,6 @@ PATH=${SCRIPT_DIR}/bin:$PATH

printf "\e[32m===== REMAPPING AND CLUSTERING PIPELINE =====\e[0m\n"
nextflow run ${SOURCE_DIR}/eva_assembly_ingestion/nextflow/remap_cluster.nf -params-file test_config.yaml \
--taxonomy_id 1234 \
--source_assembly_accession GCA_0000001 \
--target_assembly_accession GCA_0000002 \
--species_name "Thingy thungus" \
Expand All @@ -27,15 +26,21 @@ nextflow run ${SOURCE_DIR}/eva_assembly_ingestion/nextflow/remap_cluster.nf -par
--remapping_required 1 \
--memory 2

ls ${SCRIPT_DIR}/output/dbsnp/GCA_0000001_dbsnp_remapped.vcf \
${SCRIPT_DIR}/output/dbsnp/GCA_0000001_dbsnp_remapped_unmapped.vcf \
${SCRIPT_DIR}/output/dbsnp/GCA_0000001_dbsnp_remapped_counts.yml \
${SCRIPT_DIR}/output/eva/GCA_0000001_eva_remapped.vcf \
${SCRIPT_DIR}/output/eva/GCA_0000001_eva_remapped_unmapped.vcf \
${SCRIPT_DIR}/output/eva/GCA_0000001_eva_remapped_counts.yml

# Test we have 7 log files in the logs directory (1 extraction, 2 ingestion, 3 clustering, 1 backpropagate)
[[ $(find ${SCRIPT_DIR}/output/logs/ -type f -name "*.log" | wc -l) -eq 7 ]]
ls ${SCRIPT_DIR}/output/dbsnp/GCA_0000001_1233_dbsnp_remapped.vcf \
${SCRIPT_DIR}/output/dbsnp/GCA_0000001_1233_dbsnp_remapped_unmapped.vcf \
${SCRIPT_DIR}/output/dbsnp/GCA_0000001_1233_dbsnp_remapped_counts.yml \
${SCRIPT_DIR}/output/eva/GCA_0000001_1233_eva_remapped.vcf \
${SCRIPT_DIR}/output/eva/GCA_0000001_1233_eva_remapped_unmapped.vcf \
${SCRIPT_DIR}/output/eva/GCA_0000001_1233_eva_remapped_counts.yml \
${SCRIPT_DIR}/output/dbsnp/GCA_0000001_1234_dbsnp_remapped.vcf \
${SCRIPT_DIR}/output/dbsnp/GCA_0000001_1234_dbsnp_remapped_unmapped.vcf \
${SCRIPT_DIR}/output/dbsnp/GCA_0000001_1234_dbsnp_remapped_counts.yml \
${SCRIPT_DIR}/output/eva/GCA_0000001_1234_eva_remapped.vcf \
${SCRIPT_DIR}/output/eva/GCA_0000001_1234_eva_remapped_unmapped.vcf \
${SCRIPT_DIR}/output/eva/GCA_0000001_1234_eva_remapped_counts.yml

# Test we have 10 log files in the logs directory (2 extraction, 4 ingestion, 3 clustering, 1 backpropagate)
[[ $(find ${SCRIPT_DIR}/output/logs/ -type f -name "*.log" | wc -l) -eq 10 ]]

# Test we have 1 rs_report in the logs directory
[[ $(find ${SCRIPT_DIR}/output/logs/ -type f -name "*.txt" | wc -l) -eq 1 ]]
Expand Down
1 change: 1 addition & 0 deletions tests/nextflow-tests/test_config.yaml
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
taxonomy_list: [1233, 1234]

executable:
genome_downloader: ../../../bin/fake_genome_downloader.py
Expand Down
Loading