diff --git a/.github/workflows/pipeline_db.yaml b/.github/workflows/pipeline_db.yaml index 834c4de1b..51e31481b 100644 --- a/.github/workflows/pipeline_db.yaml +++ b/.github/workflows/pipeline_db.yaml @@ -9,10 +9,6 @@ env: CONDA_ENVS_PATH: "/home/runner/miniconda/envs:/usr/share/miniconda/envs" CONDA_PKGS_DIRS: "/home/runner/miniconda/pkgs" GH_RESOURCES: "--max_memory 6.GB --max_cpus 2" - SQLITE_CONFIG: "/home/runner/.nextflow/assets/ktmeaton/plague-phylogeography/config/ncbimeta.yaml" - SQLITE_TABLE: "BioSample" - SQLITE_BACKUP: "/home/runner/.nextflow/assets/ktmeaton/plague-phylogeography/results/ncbimeta_db/update/latest/yersinia_pestis_db_BioSample.tsv" - SQLITE_TEST_DB: "/home/runner/work/plague-phylogeography/plague-phylogeography/test/ncbimeta_db/update/latest/output/database/yersinia_pestis_db.sqlite" #------------------------------------------------------------------------------# # Workflow conditions on: @@ -74,18 +70,21 @@ jobs: shell: bash -l {0} run: scripts/install.sh ${{github.repository}} ${{ github.sha }} #------------------------------------------------------------------------# - # Create the database + # Create the example database - name: pipeline db create shell: bash -l {0} run: | conda activate ${PHYLO_CONDA_ENV} nextflow run -r ${{ github.sha }} ${{github.repository}} \ - --ncbimeta_create ${SQLITE_CONFIG} \ + --ncbimeta_create example/ncbimeta_test.yaml \ --ncbimeta_api_param "--force-pause-seconds 0 --email ${{secrets.EMAIL}} --api ${{secrets.NCBI_API}}" \ + --ncbimeta_output_dir test \ + --ncbimeta_sqlite_db test.sqlite \ --skip_sqlite_import \ --skip_reference_download \ --skip_outgroup_download \ - --outdir test + --outdir test \ + ${GH_RESOURCES} conda deactivate #------------------------------------------------------------------------# # Update the database @@ -94,13 +93,16 @@ jobs: run: | conda activate ${PHYLO_CONDA_ENV} nextflow run -r ${{ github.sha }} ${{github.repository}} \ - --ncbimeta_update ${SQLITE_CONFIG} \ - 
--ncbimeta_annot ${SQLITE_BACKUP} \ - --ncbimeta_annot_table ${SQLITE_TABLE} \ - --sqlite ${SQLITE_TEST_DB} \ - --skip_sqlite_import \ - --skip_reference_download \ - --skip_outgroup_download \ - --outdir test \ - -resume + --ncbimeta_update example/ncbimeta_test.yaml \ + --ncbimeta_sqlite_db test.sqlite \ + --ncbimeta_output_dir test \ + --ncbimeta_annot example/ncbimeta_test_annot.txt \ + --ncbimeta_annot_table BioSample \ + --sqlite test.sqlite \ + --skip_sqlite_import \ + --skip_reference_download \ + --skip_outgroup_download \ + --outdir test \ + -resume \ + ${GH_RESOURCES} conda deactivate diff --git a/custom/GCA_009669545.1_ASM966954v1_genomic.fna b/example/GCA_009669545.1_ASM966954v1_genomic.fna similarity index 100% rename from custom/GCA_009669545.1_ASM966954v1_genomic.fna rename to example/GCA_009669545.1_ASM966954v1_genomic.fna diff --git a/custom/GCA_009669555.1_ASM966955v1_genomic.fna b/example/GCA_009669555.1_ASM966955v1_genomic.fna similarity index 100% rename from custom/GCA_009669555.1_ASM966955v1_genomic.fna rename to example/GCA_009669555.1_ASM966955v1_genomic.fna diff --git a/custom/GCA_009669565.1_ASM966956v1_genomic.fna b/example/GCA_009669565.1_ASM966956v1_genomic.fna similarity index 100% rename from custom/GCA_009669565.1_ASM966956v1_genomic.fna rename to example/GCA_009669565.1_ASM966956v1_genomic.fna diff --git a/custom/SAMN02442718/paired/SRR1048902_1.fastq.gz b/example/SAMN02442718/paired/SRR1048902_1.fastq.gz similarity index 100% rename from custom/SAMN02442718/paired/SRR1048902_1.fastq.gz rename to example/SAMN02442718/paired/SRR1048902_1.fastq.gz diff --git a/custom/SAMN02442718/paired/SRR1048902_2.fastq.gz b/example/SAMN02442718/paired/SRR1048902_2.fastq.gz similarity index 100% rename from custom/SAMN02442718/paired/SRR1048902_2.fastq.gz rename to example/SAMN02442718/paired/SRR1048902_2.fastq.gz diff --git a/custom/SAMN02442721/paired/SRR1048905_1.fastq.gz b/example/SAMN02442721/paired/SRR1048905_1.fastq.gz similarity index 100% 
rename from custom/SAMN02442721/paired/SRR1048905_1.fastq.gz rename to example/SAMN02442721/paired/SRR1048905_1.fastq.gz diff --git a/custom/SAMN02442721/paired/SRR1048905_2.fastq.gz b/example/SAMN02442721/paired/SRR1048905_2.fastq.gz similarity index 100% rename from custom/SAMN02442721/paired/SRR1048905_2.fastq.gz rename to example/SAMN02442721/paired/SRR1048905_2.fastq.gz diff --git a/custom/local_data_eager.tsv b/example/local_data_eager.tsv similarity index 100% rename from custom/local_data_eager.tsv rename to example/local_data_eager.tsv diff --git a/example/ncbimeta_test.yaml b/example/ncbimeta_test.yaml new file mode 100644 index 000000000..ea1582088 --- /dev/null +++ b/example/ncbimeta_test.yaml @@ -0,0 +1,247 @@ +# User Configuration Variables +OUTPUT_DIR : test +EMAIL : myusername@domain.com +API_KEY : +FORCE_PAUSE_SECONDS : 1 +DATABASE : test.sqlite + +# NCBI Tables to Query +TABLES : + - Assembly + - BioSample + - BioProject + - SRA + - Nucleotide + - Pubmed + +# Query Terms to Use +SEARCH_TERMS : + - Assembly : (SAMN12991206[BioSample]) + - BioProject : (PRJNA269675[BioProject IDs and Accessions]) + - BioSample: (SAMN12991206[Accession]) + - SRA : (SAMN12991206[BioSample]) + - Nucleotide : (SAMN12991206[BioSample]) + - Pubmed : (26634751[uid]) + +# Columns of the database (ie. 
metadata fields to retrieve) +TABLE_COLUMNS : + + - Assembly : + - AssemblyAccession : AssemblyAccession + - AssemblyBioSampleAccession : BioSampleAccn + - AssemblyBioSampleID : BioSampleId + - AssemblyGenbankBioprojectAccession : GB_BioProjects, BioprojectAccn + - AssemblyGenbankID : GbUid + - AssemblyRefseqBioprojectAccession : RS_BioProjects, BioprojectAccn + - AssemblyRefSeqCategory : RefSeq_category + - AssemblyRefSeqID : RsUid + - AssemblyWGSAccession : WGS + - AssemblyInfraspecies : InfraspeciesList, Sub_value + - AssemblyIsolate : Isolate + - AssemblyOrganism : Organism + - AssemblySpeciesTaxonomicID : SpeciesTaxid + - AssemblySpeciesName : SpeciesName + - AssemblyTaxonomicID : Taxid + - AssemblyName : AssemblyName + - AssemblyStatus : AssemblyStatus + - AssemblyType : AssemblyType + - AssemblyCoverage : Coverage + - AssemblyChromosomes : Meta, Stat, category, chromosome_count + - AssemblyContigCount: Meta, Stat, category, contig_count + - AssemblyContigN50 : Meta, Stat, category, contig_n50 + - AssemblyContigL50 : Meta, Stat, category, contig_l50 + - AssemblyNonChromosomalReplicons : Meta, Stat, category, non_chromosome_replicon_count + - AssemblyReplicons : Meta, Stat, category, replicon_count + - AssemblyScaffolds : Meta, Stat, category, scaffold_count + - AssemblyScaffoldN50 : Meta, Stat, category, scaffold_n50 + - AssemblyScaffoldL50 : Meta, Stat, category, scaffold_l50 + - AssemblyTotalLength : Meta, Stat, category, total_length + - AssemblyUngappedLength : Meta, Stat, category, ungapped_length + - AssemblySubmitterOrganization : SubmitterOrganization + - AssemblySubmissionDate : SubmissionDate + - AssemblyReleaseDate : SeqReleaseDate + - AssemblyFTPAssemblyReport : FtpPath_Assembly_rpt + - AssemblyFTPGenbank : FtpPath_GenBank + - AssemblyFTPRefSeq : FtpPath_RefSeq + - AssemblyFTPStatsReport : FtpPath_Stats_rpt + - AssemblyComment : NullValue + + - BioSample : + - BioSampleAccession: BioSample, accession + - BioSampleAccessionSecondary: NullValue + - 
BioSampleBioProjectAccession: XPATH, //Links/Link[@target='bioproject']/@label + - BioSampleSRAAccession: Id, db, SRA + - BioSampleTitle: Title + - BioSampleName: Id, db_label, Sample name + - BioSampleType: Attribute, harmonized_name, sample_type + - BioSamplePackage: Package + - BioSampleInfraspecies: Infraspecies + - BioSampleOrganism: Description, OrganismName + - BioSampleOrganismAlt: Description, Organism, taxonomy_name + - BioSampleSubSpecies: Attribute, harmonized_name, sub_species + - BioSampleStrain: Attribute, harmonized_name, strain + - BioSampleTaxonomyID: Organism, taxonomy_id + - BioSampleBiovar: Attribute, harmonized_name, biovar + - BioSampleSerovar: Attribute, harmonized_name, serovar + - BioSampleCollectionDate: Attribute, harmonized_name, collection_date + - BioSampleGeographicLocation: Attribute, harmonized_name, geo_loc_name + - BioSampleHost: Attribute, harmonized_name, host + - BioSampleHostDisease : Attribute, harmonized_name, host_disease + - BioSampleHostHealthState : Attribute, harmonized_name, host_health_state + - BioSampleIsolateNameAlias: Attribute, harmonized_name, isolate_name_alias + - BioSampleIsolationSource: Attribute, harmonized_name, isolation_source + - BioSampleLat : Attribute, attribute_name, Latitude + - BioSampleLatLon : Attribute, harmonized_name, lat_lon + - BioSampleLon : Attribute, attribute_name, Longitude + - BioSampleSubmissionDate: BioSample, submission_date + - BioSampleModificationDate: BioSample, last_update + - BioSamplePublicationDate: BioSample, publication_date + - BioSampleOrganization: Owner, Name + - BioSampleComment : NullValue + + - BioProject : + - BioProjectAccession : ArchiveID, accession + - BioProjectDataType : ProjectDataTypeSet, DataType + - BioProjectDescription : ProjectDescr, Description + - BioProjectMethodType : Method, method_type + - BioProjectName : ProjectDescr, Name + - BioProjectTargetCapture : Target, capture + - BioProjectTargetMaterial : Target, material + - BioProjectTargetScope 
: Target, sample_scope + - BioProjectTitle : XPATH, //ProjectDescr/Title/text() + - BioProjectOrganismLabel : ProjectTypeSubmission, Label + - BioProjectOrganismStrain : ProjectTypeSubmission, Strain + - BioProjectOrganismTaxID : ProjectTypeSubmission, Organism, taxID + - BioProjectSpeciesTaxID : ProjectTypeSubmission, Organism, species + - BioProjectSupergroup : ProjectTypeSubmission, Supergroup + - BioProjectRegistrationDate : Submission, submitted + - BioProjectReleaseDate: ProjectReleaseDate + - BioProjectModificationDate: Submission, last_update + - BioProjectRelevanceMedical : Relevance, Medical + - BioProjectSubmitterOrganization : Organization, Name + - BioProjectPublished: Publication, status + - BioProjectDatePublished: Publication, date + - BioProjectPublicationID: Publication, id + - BioProjectPublicationDB: Publication, DbType + - BioProjectComment : NullValue + + - Nucleotide : + - NucleotideAccession : GBSeq_primary-accession + - NucleotideAccessionVersion : GBSeq_accession-version + - NucleotideBioSampleAccession: XPATH, //GBXref[GBXref_dbname/text() = 'BioSample']/GBXref_id + - NucleotideBioProjectAccession : GBSeq_project + - NucleotideOrganism : GBSeq_organism + - NucleotideTaxonomy : GBSeq_taxonomy + - NucleotideDefinition : GBSeq_definition + - NucleotideDivision : GBSeq_division + - NucleotideReferenceJournal: GBSeq_references, GBReference_journal + - NucleotideReferenceTitle : GBSeq_references, GBReference_title + - NucleotideReferenceAuthors : GBSeq_references, GBReference_authors, GBAuthor + - NucleotideLength : GBSeq_length + - NucleotideMoleculeType : GBSeq_moltype + - NucleotideSeqDataName : GBSeq_alt-seq, GBAltSeqData_name + - NucleotideSource : GBSeq_source + - NucleotideStrandedness : GBSeq_strandedness + - NucleotideTopology : GBSeq_topology + - NucleotideCreateDate : GBSeq_create-date + - NucleotideUpdateDate : GBSeq_update-date + - NucleotideGenBankComment : GBSeq_comment + - NucleotideAnnotationDate: Annotation Date + - 
NucleotideAnnotationMethod: Annotation Method + - NucleotideAnnotationPipeline: Annotation Pipeline + - NucleotideAnnotationProvider: Annotation Provider + - NucleotideAnnotationSoftwarerevision: Annotation Software revision + - NucleotideAssemblyDate: Assembly Date + - NucleotideAssemblyMethod: Assembly Method + - NucleotideAssemblyName: Assembly Name + - NucleotideCDS: CDS + - NucleotideCDSTotal: CDS (total) + - NucleotideCDSCoding: CDS (coding) + - NucleotideCDSProtein: CDS (with protein) + - NucleotideCDSWithoutProtein: CDS (without protein) + - NucleotideCRISPRArrays: CRISPR Arrays + - NucleotideExpectedFinalVersion: Expected Final Version + - NucleotideFeaturesAnnotated: Features Annotated + - NucleotideGenes: Genes + - NucleotideGenesTotal: Genes (total) + - NucleotideGenesCoding: Genes (coding) + - NucleotideGenesRNA: Genes (RNA) + - NucleotideGenomeCoverage: Genome Coverage + - NucleotideGenomeRepresentation: Genome Representation + - NucleotidencRNAs: ncRNAs + - NucleotidePseudoGenes: Pseudo Genes + - NucleotidePseudoGenesTotal: Pseudo Genes (total) + - NucleotidePseudoGenesAmbResidues: Pseudo Genes (ambiguous residues) + - NucleotidePseudoGenesFrameshifted: Pseudo Genes (frameshifted) + - NucleotidePseudoGenesIncomplete: Pseudo Genes (incomplete) + - NucleotidePseudoGenesInternalStop: Pseudo Genes (internal stop) + - NucleotidePseudoGenesMultipleProblems: Pseudo Genes (multiple problems) + - NucleotiderRNAs: rRNAs + - NucleotiderRNAsComplete: complete rRNAs + - NucleotiderRNAsPartial: partial rRNAs + - NucleotideSequencingTechnology: Sequencing Technology + - NucleotideRNAs: tRNAs + - NucleotideComment : NullValue + + - SRA : + - SRABioProjectAccession : STUDY, EXTERNAL_ID, namespace, BioProject + - SRABioSampleAccession : RUN_SET, RUN, Pool, EXTERNAL_ID, namespace, BioSample + - SRASampleAccession : SAMPLE_DESCRIPTOR, accession + - SRASampleName : SAMPLE, alias + - SRAExperimentAccession : EXPERIMENT, accession + - SRAExperimentName : EXPERIMENT, alias 
+ - SRARunAccession : RUN, accession + - SRARunName : RUN, alias + - SRAIsPublic : RUN, is_public + - SRAStaticDataAvailable : RUN, static_data_available + - SRAStudyAcc : STUDY, accession + - SRAStudyName : STUDY_TITLE + - SRAStudyAbstract: STUDY_ABSTRACT + - SRAOrganismName : SAMPLE_NAME, SCIENTIFIC_NAME + - SRAOrganismTaxID : SAMPLE, TAXON_ID + - SRAClusterName : RUN, cluster_name + - SRAPlatform : PLATFORM + - SRAInstrumentModel : PLATFORM, INSTRUMENT_MODEL + - SRALibraryName : LIBRARY_NAME + - SRALibraryLayout : LIBRARY_LAYOUT + - SRALibrarySelection : LIBRARY_SELECTION + - SRALibrarySource : LIBRARY_SOURCE + - SRALibraryStrategy : LIBRARY_STRATEGY + - SRATotalBases : RUN_SET, RUN, total_bases + - SRATotalSize : RUN_SET, RUN, size + - SRATotalSpots : RUN_SET, RUN, total_spots + - SRAFileUrl : SRAFile, url + - SRAFileName : SRAFile, filename + - SRAFileSize : SRAFile, size + - SRAFileType : SRAFile, semantic_name + - SRARunPublishDate : RUN_SET, RUN, published + - SRACenterName : SUBMISSION, center_name + - SRAContactEmail : Organization, Contact, email + - SRALabName : SUBMISSION, lab_name + - SRASubmitterAccession : SUBMISSION, accession + - SRAComment : NullValue + + - Pubmed : + - PubmedPublishYear : PubDate, Year + - PubmedPubishMonth : PubDate, Month + - PubmedPublishDay : PubDate, Day + - PubmedEPublishDate : EPubDate + - PubmedPublishModel : Article, PubModel + - PubmedType : PublicationType + - PubmedJournalTitle : Journal, Title + - PubmedJournalAbbrev : Journal, ISOAbbreviation + - PubmedJournalISSN : Journal, ISSN + - PubmedArticleTitle : ArticleTitle + - PubmedAbstract : AbstractText + - PubmedVolume : Volume + - PubmedIssue : Issue + - PubmedPages : MedlinePgn + - PubmedDOI : ELocationID, EIdType, doi + - PubmedAuthorsLastName : AuthorList, LastName + - PubmedAuthorsForeName : AuthorList, ForeName + - PubmedAuthorsAffiliation : AuthorList, Affiliation + - PubmedLanguage : Language + - PubmedCitations : Reference, Citation + - PubmedRecordStatus : 
MedlineCitation, Status + - PubmedPubStatus : PublicationStatus + - PubmedComment : NullValue diff --git a/example/ncbimeta_test_annot.txt b/example/ncbimeta_test_annot.txt new file mode 100644 index 000000000..3dcd12696 --- /dev/null +++ b/example/ncbimeta_test_annot.txt @@ -0,0 +1,3 @@ +BioSampleAccession BioSampleHostDisease BioSampleCollectionDate +SAMN12991206 Plague 1984 +MissingAccession123 MissingDisease MissingDate diff --git a/main.nf b/main.nf index 816aa8c91..6146269ed 100644 --- a/main.nf +++ b/main.nf @@ -207,7 +207,6 @@ if (!params.skip_ncbimeta_db_create && params.ncbimeta_create){ tag "$ncbimeta_yaml" publishDir "${outdir}/ncbimeta_db/create", mode: 'copy' publishDir "${outdir}/ncbimeta_db/update/latest", mode: 'copy' - echo true ch_ncbimeta_yaml_create = Channel.fromPath(params.ncbimeta_create, checkIfExists: true) - .ifEmpty { exit 1, "NCBImeta config file not found: ${params.ncbimeta-create}" } + .ifEmpty { exit 1, "NCBImeta config file not found: ${params.ncbimeta_create}" } @@ -233,6 +232,9 @@ if (!params.skip_ncbimeta_db_create && params.ncbimeta_create){ if(!params.skip_ncbimeta_db_update && params.ncbimeta_update){ + ncbimeta_sqlite_db_latest = "${params.outdir}/ncbimeta_db/update/latest/${params.ncbimeta_output_dir}/database/${params.ncbimeta_sqlite_db}" + println "${ncbimeta_sqlite_db_latest}" + process ncbimeta_db_update{ /* Run NCBImeta queries to update, annotate, and join a previously created database. 
@@ -263,23 +265,27 @@ if(!params.skip_ncbimeta_db_update && params.ncbimeta_update){ .ifEmpty { exit 1, "NCBImeta config file not found: ${params.ncbimeta_update}" } // If create and update not in same run (not fully reproducing finished pipeline) if (!params.ncbimeta_create){ - ch_ncbimeta_sqlite_update = Channel.fromPath(params.ncbimeta_sqlite_db_latest, checkIfExists: true) - .ifEmpty { exit 1, "NCBImeta SQLite database not found: ${params.ncbimeta_sqlite_db_latest}" } + ch_ncbimeta_sqlite_update = Channel.fromPath("${ncbimeta_sqlite_db_latest}", checkIfExists: true) + .ifEmpty { exit 1, "NCBImeta SQLite database not found: ${ncbimeta_sqlite_db_latest}" } } // If an annotation file has been supplied, the annotation script will be run if (params.ncbimeta_annot){ - Channel - .fromPath(params.ncbimeta_annot, checkIfExists: true) - .ifEmpty { exit 1, "NCBImeta annotation file not found: ${params.ncbimeta_annot}" } - .collectFile(name: 'dummy_annot.txt', newLine: true, storeDir: "${workDir}") + ch_ncbimeta_annot = Channel + .fromPath(params.ncbimeta_annot, checkIfExists: true) + .ifEmpty { exit 1, "NCBImeta annotation file not found: ${params.ncbimeta_annot}" } + } + else{ + // No annotation file: stage a placeholder, because an empty input channel would stop this process from ever running + ch_ncbimeta_annot = Channel.value(file("NO_ANNOT_FILE")) } // IO and conditional behavior input: file ncbimeta_yaml from ch_ncbimeta_yaml_update file ncbimeta_sqlite from ch_ncbimeta_sqlite_update + file ncbimeta_annot from ch_ncbimeta_annot output: file "${params.ncbimeta_output_dir}/database/${params.ncbimeta_sqlite_db}" into ch_ncbimeta_sqlite_import file ncbimeta_yaml @@ -299,10 +305,8 @@ if(!params.skip_ncbimeta_db_update && params.ncbimeta_update){ # Execute NCBImeta NCBImeta.py --config ${ncbimeta_yaml} # If annotation file supplied, run the annotation script - if [[ ${params.ncbimeta_annot} != "false" ]]; then - ANNOT_FILE=`basename ${params.ncbimeta_annot}` - mv ${workDir}/dummy_annot.txt 
`pwd`/\$ANNOT_FILE; - NCBImetaAnnotateReplace.py --table ${params.ncbimeta_annot_table} --annot ${params.ncbimeta_annot} --database ${params.ncbimeta_output_dir}/database/${params.ncbimeta_sqlite_db} + if [[ "${params.ncbimeta_annot}" != "false" ]]; then + NCBImetaAnnotateReplace.py --table ${params.ncbimeta_annot_table} --annot ${ncbimeta_annot} --database ${params.ncbimeta_output_dir}/database/${params.ncbimeta_sqlite_db} fi # Drop old or outdated join tables sqlite3 ${params.ncbimeta_output_dir}/database/${params.ncbimeta_sqlite_db} "DROP TABLE IF EXISTS MasterFirst" @@ -837,7 +841,6 @@ process eager{ // Other variables and config tag "$biosample_val" publishDir "${outdir}/eager", mode: 'copy' - echo true // If a custom tsv was supplied if (params.eager_tsv){ diff --git a/nextflow.config b/nextflow.config index 96a2488c5..6844bb91b 100644 --- a/nextflow.config +++ b/nextflow.config @@ -81,7 +81,6 @@ params{ ncbimeta_output_dir = "output" ncbimeta_api_param = false ncbimeta_sqlite_db = "yersinia_pestis_db.sqlite" - ncbimeta_sqlite_db_latest = "${params.outdir}/ncbimeta_db/update/latest/${params.ncbimeta_output_dir}/database/${params.ncbimeta_sqlite_db}" // NCBImetaAnnotate parameters ncbimeta_annot_table = "BioSample"