Skip to content

Commit

Permalink
proper channel support for ncbimeta_annot
Browse files Browse the repository at this point in the history
  • Loading branch information
ktmeaton committed Sep 2, 2020
1 parent 87440bb commit 7c1e065
Show file tree
Hide file tree
Showing 13 changed files with 283 additions and 29 deletions.
34 changes: 18 additions & 16 deletions .github/workflows/pipeline_db.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -9,10 +9,6 @@ env:
CONDA_ENVS_PATH: "/home/runner/miniconda/envs:/usr/share/miniconda/envs"
CONDA_PKGS_DIRS: "/home/runner/miniconda/pkgs"
GH_RESOURCES: "--max_memory 6.GB --max_cpus 2"
SQLITE_CONFIG: "/home/runner/.nextflow/assets/ktmeaton/plague-phylogeography/config/ncbimeta.yaml"
SQLITE_TABLE: "BioSample"
SQLITE_BACKUP: "/home/runner/.nextflow/assets/ktmeaton/plague-phylogeography/results/ncbimeta_db/update/latest/yersinia_pestis_db_BioSample.tsv"
SQLITE_TEST_DB: "/home/runner/work/plague-phylogeography/plague-phylogeography/test/ncbimeta_db/update/latest/output/database/yersinia_pestis_db.sqlite"
#------------------------------------------------------------------------------#
# Workflow conditions
on:
Expand Down Expand Up @@ -74,18 +70,21 @@ jobs:
shell: bash -l {0}
run: scripts/install.sh ${{github.repository}} ${{ github.sha }}
#------------------------------------------------------------------------#
# Create the database
# Create the example database
- name: pipeline db create
shell: bash -l {0}
run: |
conda activate ${PHYLO_CONDA_ENV}
nextflow run -r ${{ github.sha }} ${{github.repository}} \
--ncbimeta_create ${SQLITE_CONFIG} \
--ncbimeta_create example/ncbimeta_test.yaml \
--ncbimeta_api_param "--force-pause-seconds 0 --email ${{secrets.EMAIL}} --api ${{secrets.NCBI_API}}" \
--ncbimeta_output_dir test \
--ncbimeta_sqlite_db test.sqlite \
--skip_sqlite_import \
--skip_reference_download \
--skip_outgroup_download \
--outdir test
--outdir test \
${GH_RESOURCES}
conda deactivate
#------------------------------------------------------------------------#
# Update the database
Expand All @@ -94,13 +93,16 @@ jobs:
run: |
conda activate ${PHYLO_CONDA_ENV}
nextflow run -r ${{ github.sha }} ${{github.repository}} \
--ncbimeta_update ${SQLITE_CONFIG} \
--ncbimeta_annot ${SQLITE_BACKUP} \
--ncbimeta_annot_table ${SQLITE_TABLE} \
--sqlite ${SQLITE_TEST_DB} \
--skip_sqlite_import \
--skip_reference_download \
--skip_outgroup_download \
--outdir test \
-resume
--ncbimeta_update example/ncbimeta_test.yaml \
--ncbimeta_sqlite_db test.sqlite \
--ncbimeta_output_dir test \
--ncbimeta_annot example/ncbimeta_test_annot.txt \
--ncbimeta_annot_table BioSample \
--sqlite test.sqlite \
--skip_sqlite_import \
--skip_reference_download \
--skip_outgroup_download \
--outdir test \
-resume \
${GH_RESOURCES}
conda deactivate
File renamed without changes.
247 changes: 247 additions & 0 deletions example/ncbimeta_test.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,247 @@
# User Configuration Variables
OUTPUT_DIR : test
EMAIL : myusername@domain.com
API_KEY :
FORCE_PAUSE_SECONDS : 1
DATABASE : test.sqlite

# NCBI Tables to Query
TABLES :
- Assembly
- BioSample
- BioProject
- SRA
- Nucleotide
- Pubmed

# Query Terms to Use
SEARCH_TERMS :
- Assembly : (SAMN12991206[BioSample])
- BioProject : (PRJNA269675[BioProject IDs and Accessions])
- BioSample: (SAMN12991206[Accession])
- SRA : (SAMN12991206[BioSample])
- Nucleotide : (SAMN12991206[BioSample])
- Pubmed : (26634751[uid])

# Columns of the database (ie. metadata fields to retrieve)
TABLE_COLUMNS :

- Assembly :
- AssemblyAccession : AssemblyAccession
- AssemblyBioSampleAccession : BioSampleAccn
- AssemblyBioSampleID : BioSampleId
- AssemblyGenbankBioprojectAccession : GB_BioProjects, BioprojectAccn
- AssemblyGenbankID : GbUid
- AssemblyRefseqBioprojectAccession : RS_BioProjects, BioprojectAccn
- AssemblyRefSeqCategory : RefSeq_category
- AssemblyRefSeqID : RsUid
- AssemblyWGSAccession : WGS
- AssemblyInfraspecies : InfraspeciesList, Sub_value
- AssemblyIsolate : Isolate
- AssemblyOrganism : Organism
- AssemblySpeciesTaxonomicID : SpeciesTaxid
- AssemblySpeciesName : SpeciesName
- AssemblyTaxonomicID : Taxid
- AssemblyName : AssemblyName
- AssemblyStatus : AssemblyStatus
- AssemblyType : AssemblyType
- AssemblyCoverage : Coverage
- AssemblyChromosomes : Meta, Stat, category, chromosome_count
- AssemblyContigCount: Meta, Stat, category, contig_count
- AssemblyContigN50 : Meta, Stat, category, contig_n50
- AssemblyContigL50 : Meta, Stat, category, contig_l50
- AssemblyNonChromosomalReplicons : Meta, Stat, category, non_chromosome_replicon_count
- AssemblyReplicons : Meta, Stat, category, replicon_count
- AssemblyScaffolds : Meta, Stat, category, scaffold_count
- AssemblyScaffoldN50 : Meta, Stat, category, scaffold_n50
- AssemblyScaffoldL50 : Meta, Stat, category, scaffold_l50
- AssemblyTotalLength : Meta, Stat, category, total_length
- AssemblyUngappedLength : Meta, Stat, category, ungapped_length
- AssemblySubmitterOrganization : SubmitterOrganization
- AssemblySubmissionDate : SubmissionDate
- AssemblyReleaseDate : SeqReleaseDate
- AssemblyFTPAssemblyReport : FtpPath_Assembly_rpt
- AssemblyFTPGenbank : FtpPath_GenBank
- AssemblyFTPRefSeq : FtpPath_RefSeq
- AssemblyFTPStatsReport : FtpPath_Stats_rpt
- AssemblyComment : NullValue

- BioSample :
- BioSampleAccession: BioSample, accession
- BioSampleAccessionSecondary: NullValue
- BioSampleBioProjectAccession: XPATH, //Links/Link[@target='bioproject']/@label
- BioSampleSRAAccession: Id, db, SRA
- BioSampleTitle: Title
- BioSampleName: Id, db_label, Sample name
- BioSampleType: Attribute, harmonized_name, sample_type
- BioSamplePackage: Package
- BioSampleInfraspecies: Infraspecies
- BioSampleOrganism: Description, OrganismName
- BioSampleOrganismAlt: Description, Organism, taxonomy_name
- BioSampleSubSpecies: Attribute, harmonized_name, sub_species
- BioSampleStrain: Attribute, harmonized_name, strain
- BioSampleTaxonomyID: Organism, taxonomy_id
- BioSampleBiovar: Attribute, harmonized_name, biovar
- BioSampleSerovar: Attribute, harmonized_name, serovar
- BioSampleCollectionDate: Attribute, harmonized_name, collection_date
- BioSampleGeographicLocation: Attribute, harmonized_name, geo_loc_name
- BioSampleHost: Attribute, harmonized_name, host
- BioSampleHostDisease : Attribute, harmonized_name, host_disease
- BioSampleHostHealthState : Attribute, harmonized_name, host_health_state
- BioSampleIsolateNameAlias: Attribute, harmonized_name, isolate_name_alias
- BioSampleIsolationSource: Attribute, harmonized_name, isolation_source
- BioSampleLat : Attribute, attribute_name, Latitude
- BioSampleLatLon : Attribute, harmonized_name, lat_lon
- BioSampleLon : Attribute, attribute_name, Longitude
- BioSampleSubmissionDate: BioSample, submission_date
- BioSampleModificationDate: BioSample, last_update
- BioSamplePublicationDate: BioSample, publication_date
- BioSampleOrganization: Owner, Name
- BioSampleComment : NullValue

- BioProject :
- BioProjectAccession : ArchiveID, accession
- BioProjectDataType : ProjectDataTypeSet, DataType
- BioProjectDescription : ProjectDescr, Description
- BioProjectMethodType : Method, method_type
- BioProjectName : ProjectDescr, Name
- BioProjectTargetCapture : Target, capture
- BioProjectTargetMaterial : Target, material
- BioProjectTargetScope : Target, sample_scope
- BioProjectTitle : XPATH, //ProjectDescr/Title/text()
- BioProjectOrganismLabel : ProjectTypeSubmission, Label
- BioProjectOrganismStrain : ProjectTypeSubmission, Strain
- BioProjectOrganismTaxID : ProjectTypeSubmission, Organism, taxID
- BioProjectSpeciesTaxID : ProjectTypeSubmission, Organism, species
- BioProjectSupergroup : ProjectTypeSubmission, Supergroup
- BioProjectRegistrationDate : Submission, submitted
- BioProjectReleaseDate: ProjectReleaseDate
- BioProjectModificationDate: Submission, last_update
- BioProjectRelevanceMedical : Relevance, Medical
- BioProjectSubmitterOrganization : Organization, Name
- BioProjectPublished: Publication, status
- BioProjectDatePublished: Publication, date
- BioProjectPublicationID: Publication, id
- BioProjectPublicationDB: Publication, DbType
- BioProjectComment : NullValue

- Nucleotide :
- NucleotideAccession : GBSeq_primary-accession
- NucleotideAccessionVersion : GBSeq_accession-version
- NucleotideBioSampleAccession: XPATH, //GBXref[GBXref_dbname/text() = 'BioSample']/GBXref_id
- NucleotideBioProjectAccession : GBSeq_project
- NucleotideOrganism : GBSeq_organism
- NucleotideTaxonomy : GBSeq_taxonomy
- NucleotideDefinition : GBSeq_definition
- NucleotideDivision : GBSeq_division
- NucleotideReferenceJournal: GBSeq_references, GBReference_journal
- NucleotideReferenceTitle : GBSeq_references, GBReference_title
- NucleotideReferenceAuthors : GBSeq_references, GBReference_authors, GBAuthor
- NucleotideLength : GBSeq_length
- NucleotideMoleculeType : GBSeq_moltype
- NucleotideSeqDataName : GBSeq_alt-seq, GBAltSeqData_name
- NucleotideSource : GBSeq_source
- NucleotideStrandedness : GBSeq_strandedness
- NucleotideTopology : GBSeq_topology
- NucleotideCreateDate : GBSeq_create-date
- NucleotideUpdateDate : GBSeq_update-date
- NucleotideGenBankComment : GBSeq_comment
- NucleotideAnnotationDate: Annotation Date
- NucleotideAnnotationMethod: Annotation Method
- NucleotideAnnotationPipeline: Annotation Pipeline
- NucleotideAnnotationProvider: Annotation Provider
- NucleotideAnnotationSoftwarerevision: Annotation Software revision
- NucleotideAssemblyDate: Assembly Date
- NucleotideAssemblyMethod: Assembly Method
- NucleotideAssemblyName: Assembly Name
- NucleotideCDS: CDS
- NucleotideCDSTotal: CDS (total)
- NucleotideCDSCoding: CDS (coding)
- NucleotideCDSProtein: CDS (with protein)
- NucleotideCDSWithoutProtein: CDS (without protein)
- NucleotideCRISPRArrays: CRISPR Arrays
- NucleotideExpectedFinalVersion: Expected Final Version
- NucleotideFeaturesAnnotated: Features Annotated
- NucleotideGenes: Genes
- NucleotideGenesTotal: Genes (total)
- NucleotideGenesCoding: Genes (coding)
- NucleotideGenesRNA: Genes (RNA)
- NucleotideGenomeCoverage: Genome Coverage
- NucleotideGenomeRepresentation: Genome Representation
- NucleotidencRNAs: ncRNAs
- NucleotidePseudoGenes: Pseudo Genes
- NucleotidePseudoGenesTotal: Pseudo Genes (total)
- NucleotidePseudoGenesAmbResidues: Pseudo Genes (ambiguous residues)
- NucleotidePseudoGenesFrameshifted: Pseudo Genes (frameshifted)
- NucleotidePseudoGenesIncomplete: Pseudo Genes (incomplete)
- NucleotidePseudoGenesInternalStop: Pseudo Genes (internal stop)
- NucleotidePseudoGenesMultipleProblems: Pseudo Genes (multiple problems)
- NucleotiderRNAs: rRNAs
- NucleotiderRNAsComplete: complete rRNAs
- NucleotiderRNAsPartial: partial rRNAs
- NucleotideSequencingTechnology: Sequencing Technology
- NucleotideRNAs: tRNAs
- NucleotideComment : NullValue

- SRA :
- SRABioProjectAccession : STUDY, EXTERNAL_ID, namespace, BioProject
- SRABioSampleAccession : RUN_SET, RUN, Pool, EXTERNAL_ID, namespace, BioSample
- SRASampleAccession : SAMPLE_DESCRIPTOR, accession
- SRASampleName : SAMPLE, alias
- SRAExperimentAccession : EXPERIMENT, accession
- SRAExperimentName : EXPERIMENT, alias
- SRARunAccession : RUN, accession
- SRARunName : RUN, alias
- SRAIsPublic : RUN, is_public
- SRAStaticDataAvailable : RUN, static_data_available
- SRAStudyAcc : STUDY, accession
- SRAStudyName : STUDY_TITLE
- SRAStudyAbstract: STUDY_ABSTRACT
- SRAOrganismName : SAMPLE_NAME, SCIENTIFIC_NAME
- SRAOrganismTaxID : SAMPLE, TAXON_ID
- SRAClusterName : RUN, cluster_name
- SRAPlatform : PLATFORM
- SRAInstrumentModel : PLATFORM, INSTRUMENT_MODEL
- SRALibraryName : LIBRARY_NAME
- SRALibraryLayout : LIBRARY_LAYOUT
- SRALibrarySelection : LIBRARY_SELECTION
- SRALibrarySource : LIBRARY_SOURCE
- SRALibraryStrategy : LIBRARY_STRATEGY
- SRATotalBases : RUN_SET, RUN, total_bases
- SRATotalSize : RUN_SET, RUN, size
- SRATotalSpots : RUN_SET, RUN, total_spots
- SRAFileUrl : SRAFile, url
- SRAFileName : SRAFile, filename
- SRAFileSize : SRAFile, size
- SRAFileType : SRAFile, semantic_name
- SRARunPublishDate : RUN_SET, RUN, published
- SRACenterName : SUBMISSION, center_name
- SRAContactEmail : Organization, Contact, email
- SRALabName : SUBMISSION, lab_name
- SRASubmitterAccession : SUBMISSION, accession
- SRAComment : NullValue

- Pubmed :
- PubmedPublishYear : PubDate, Year
- PubmedPubishMonth : PubDate, Month
- PubmedPublishDay : PubDate, Day
- PubmedEPublishDate : EPubDate
- PubmedPublishModel : Article, PubModel
- PubmedType : PublicationType
- PubmedJournalTitle : Journal, Title
- PubmedJournalAbbrev : Journal, ISOAbbreviation
- PubmedJournalISSN : Journal, ISSN
- PubmedArticleTitle : ArticleTitle
- PubmedAbstract : AbstractText
- PubmedVolume : Volume
- PubmedIssue : Issue
- PubmedPages : MedlinePgn
- PubmedDOI : ELocationID, EIdType, doi
- PubmedAuthorsLastName : AuthorList, LastName
- PubmedAuthorsForeName : AuthorList, ForeName
- PubmedAuthorsAffiliation : AuthorList, Affiliation
- PubmedLanguage : Language
- PubmedCitations : Reference, Citation
- PubmedRecordStatus : MedlineCitation, Status
- PubmedPubStatus : PublicationStatus
- PubmedComment : NullValue
3 changes: 3 additions & 0 deletions example/ncbimeta_test_annot.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
BioSampleAccession BioSampleHostDisease BioSampleCollectionDate
SAMN12991206 Plague 1984
MissingAccession123 MissingDisease MissingDate
27 changes: 15 additions & 12 deletions main.nf
Original file line number Diff line number Diff line change
Expand Up @@ -207,7 +207,6 @@ if (!params.skip_ncbimeta_db_create && params.ncbimeta_create){
tag "$ncbimeta_yaml"
publishDir "${outdir}/ncbimeta_db/create", mode: 'copy'
publishDir "${outdir}/ncbimeta_db/update/latest", mode: 'copy'
echo true

// Channel emitting the NCBImeta YAML config used to create the database.
// checkIfExists makes a missing path an error; the ifEmpty guard exits with
// an explicit message naming the offending --ncbimeta_create value.
ch_ncbimeta_yaml_create = Channel.fromPath(params.ncbimeta_create, checkIfExists: true)
                                 .ifEmpty { exit 1, "NCBImeta config file not found: ${params.ncbimeta_create}" }
Expand All @@ -233,6 +232,9 @@ if (!params.skip_ncbimeta_db_create && params.ncbimeta_create){

if(!params.skip_ncbimeta_db_update && params.ncbimeta_update){

// Path of the SQLite database under the "update/latest" directory, assembled
// from the outdir, ncbimeta_output_dir and ncbimeta_sqlite_db parameters.
// NOTE(review): presumably this is where a previous create/update run
// published its database — confirm against the publishDir directives.
ncbimeta_sqlite_db_latest = "${params.outdir}/ncbimeta_db/update/latest/${params.ncbimeta_output_dir}/database/${params.ncbimeta_sqlite_db}"
// Echo the resolved path to stdout (NOTE(review): looks like a debugging aid).
println "${ncbimeta_sqlite_db_latest}"

process ncbimeta_db_update{
/*
Run NCBImeta queries to update, annotate, and join a previously created database.
Expand Down Expand Up @@ -263,23 +265,27 @@ if(!params.skip_ncbimeta_db_update && params.ncbimeta_update){
.ifEmpty { exit 1, "NCBImeta config file not found: ${params.ncbimeta_update}" }

// If create and update are not in the same run (i.e. not fully reproducing a
// finished pipeline), the database to update is read from the path held in
// ncbimeta_sqlite_db_latest rather than from the create process output.
if (!params.ncbimeta_create){
ch_ncbimeta_sqlite_update = Channel.fromPath(params.ncbimeta_sqlite_db_latest, checkIfExists: true)
.ifEmpty { exit 1, "NCBImeta SQLite database not found: ${params.ncbimeta_sqlite_db_latest}" }
ch_ncbimeta_sqlite_update = Channel.fromPath("${ncbimeta_sqlite_db_latest}", checkIfExists: true)
.ifEmpty { exit 1, "NCBImeta SQLite database not found: ${ncbimeta_sqlite_db_latest}" }
}

// If an annotation file has been supplied, the annotation script will be run
if (params.ncbimeta_annot){
Channel
.fromPath(params.ncbimeta_annot, checkIfExists: true)
.ifEmpty { exit 1, "NCBImeta annotation file not found: ${params.ncbimeta_annot}" }
.collectFile(name: 'dummy_annot.txt', newLine: true, storeDir: "${workDir}")
ch_ncbimeta_annot = Channel
.fromPath(params.ncbimeta_annot, checkIfExists: true)
.ifEmpty { exit 1, "NCBImeta annotation file not found: ${params.ncbimeta_annot}" }
}
else{
ch_ncbimeta_annot = Channel.empty()
}

// IO and conditional behavior
input:
file ncbimeta_yaml from ch_ncbimeta_yaml_update
file ncbimeta_sqlite from ch_ncbimeta_sqlite_update
file ncbimeta_annot from ch_ncbimeta_annot
output:
file "${params.ncbimeta_output_dir}/database/${params.ncbimeta_sqlite_db}" into ch_ncbimeta_sqlite_import
file ncbimeta_yaml
Expand All @@ -299,10 +305,8 @@ if(!params.skip_ncbimeta_db_update && params.ncbimeta_update){
# Execute NCBImeta
NCBImeta.py --config ${ncbimeta_yaml}
# If annotation file supplied, run the annotation script
if [[ ${params.ncbimeta_annot} != "false" ]]; then
ANNOT_FILE=`basename ${params.ncbimeta_annot}`
mv ${workDir}/dummy_annot.txt `pwd`/\$ANNOT_FILE;
NCBImetaAnnotateReplace.py --table ${params.ncbimeta_annot_table} --annot ${params.ncbimeta_annot} --database ${params.ncbimeta_output_dir}/database/${params.ncbimeta_sqlite_db}
if [[ "${params.ncbimeta_annot}" != "false" ]]; then
NCBImetaAnnotateReplace.py --table ${params.ncbimeta_annot_table} --annot ${ncbimeta_annot} --database ${params.ncbimeta_output_dir}/database/${params.ncbimeta_sqlite_db}
fi
# Drop old or outdated join tables
sqlite3 ${params.ncbimeta_output_dir}/database/${params.ncbimeta_sqlite_db} "DROP TABLE IF EXISTS MasterFirst"
Expand Down Expand Up @@ -837,7 +841,6 @@ process eager{
// Other variables and config
tag "$biosample_val"
publishDir "${outdir}/eager", mode: 'copy'
echo true

// If a custom tsv was supplied
if (params.eager_tsv){
Expand Down
1 change: 0 additions & 1 deletion nextflow.config
Original file line number Diff line number Diff line change
Expand Up @@ -81,7 +81,6 @@ params{
ncbimeta_output_dir = "output"
ncbimeta_api_param = false
ncbimeta_sqlite_db = "yersinia_pestis_db.sqlite"
ncbimeta_sqlite_db_latest = "${params.outdir}/ncbimeta_db/update/latest/${params.ncbimeta_output_dir}/database/${params.ncbimeta_sqlite_db}"

// NCBImetaAnnotate parameters
ncbimeta_annot_table = "BioSample"
Expand Down

0 comments on commit 7c1e065

Please sign in to comment.