diff --git a/.travis.yml b/.travis.yml index a68d3ec5c..33c2fa8e3 100644 --- a/.travis.yml +++ b/.travis.yml @@ -1,69 +1,81 @@ language: perl -perl: - - "5.14" - - "5.26.2" -services: - - mysql -env: - - COVERALLS=true DB=mysql -addons: - apt: - update: true - packages: - - unzip - - sendmail - - graphviz - - emboss -before_install: - - git clone --depth 1 https://github.com/Ensembl/ensembl-git-tools.git - - export PATH=$PATH:$PWD/ensembl-git-tools/bin - - export ENSEMBL_BRANCH=master - - export SECONDARY_BRANCH=main - - echo "TRAVIS_BRANCH=$TRAVIS_BRANCH" - - if [[ $TRAVIS_BRANCH =~ ^release\/[0-9]+$ ]]; then export ENSEMBL_BRANCH=$TRAVIS_BRANCH; export SECONDARY_BRANCH=$TRAVIS_BRANCH; fi - - echo "ENSEMBL_BRANCH=$ENSEMBL_BRANCH" - - echo "SECONDARY_BRANCH=$SECONDARY_BRANCH" - - git-ensembl --clone --branch $ENSEMBL_BRANCH --secondary_branch $SECONDARY_BRANCH --depth 1 ensembl-test - - git-ensembl --clone --branch $ENSEMBL_BRANCH --secondary_branch $SECONDARY_BRANCH --depth 1 ensembl - - git-ensembl --clone --branch $ENSEMBL_BRANCH --secondary_branch $SECONDARY_BRANCH --depth 1 ensembl-compara - - git-ensembl --clone --branch $ENSEMBL_BRANCH --secondary_branch $SECONDARY_BRANCH --depth 1 ensembl-datacheck - - git-ensembl --clone --branch $ENSEMBL_BRANCH --secondary_branch $SECONDARY_BRANCH --depth 1 ensembl-variation - - git-ensembl --clone --branch $ENSEMBL_BRANCH --secondary_branch $SECONDARY_BRANCH --depth 1 ensembl-metadata - - git-ensembl --clone --branch $ENSEMBL_BRANCH --secondary_branch $SECONDARY_BRANCH --depth 1 ensembl-funcgen - - git-ensembl --clone --branch master --secondary_branch main --depth 1 ensembl-hive - - git-ensembl --clone --branch master --secondary_branch main --depth 1 ensembl-orm - - git-ensembl --clone --branch master --secondary_branch main --depth 1 ensembl-taxonomy - - git clone --branch 1.9 --depth 1 https://github.com/samtools/htslib.git - - git clone --branch release-1-6-924 --depth 1 https://github.com/bioperl/bioperl-live.git - - cd htslib - - make - - export HTSLIB_DIR=$(pwd -P) - - cd .. -install: - - cpanm --sudo -v --installdeps --with-recommends --notest --cpanfile ensembl/cpanfile . - - cpanm --sudo -v --installdeps --notest --cpanfile ensembl-hive/cpanfile . - - cpanm --sudo -v --installdeps --notest --cpanfile ensembl-datacheck/cpanfile . - - export PERL5LIB=$PERL5LIB:$PWD/bioperl-live - - cpanm --sudo -v --installdeps --notest . 
- - cpanm --sudo -n Devel::Cover::Report::Coveralls - - cp travisci/MultiTestDB.conf.travisci modules/t/MultiTestDB.conf - - mysql -u root -h localhost -e 'GRANT ALL PRIVILEGES ON *.* TO "travis"@"%"' -script: - - ./travisci/harness.sh +os: linux jobs: include: - - language: python - python: 3.8 + - name: "Perl Job" + perl: "5.26.2" + services: + - mysql + env: + - COVERALLS=true DB=mysql + addons: + apt: + update: true + packages: + - unzip + - sendmail + - graphviz + - emboss + - libkyotocabinet-dev + before_install: + - git clone --depth 1 https://github.com/Ensembl/ensembl-git-tools.git + - export PATH=$PATH:$PWD/ensembl-git-tools/bin + - export ENSEMBL_BRANCH=master + - export SECONDARY_BRANCH=main + - echo "TRAVIS_BRANCH=$TRAVIS_BRANCH" + - if [[ $TRAVIS_BRANCH =~ ^release\/[0-9]+$ ]]; then export ENSEMBL_BRANCH=$TRAVIS_BRANCH; export SECONDARY_BRANCH=$TRAVIS_BRANCH; fi + - echo "ENSEMBL_BRANCH=$ENSEMBL_BRANCH" + - echo "SECONDARY_BRANCH=$SECONDARY_BRANCH" + - git-ensembl --clone --branch $ENSEMBL_BRANCH --secondary_branch $SECONDARY_BRANCH --depth 1 ensembl-test + - git-ensembl --clone --branch $ENSEMBL_BRANCH --secondary_branch $SECONDARY_BRANCH --depth 1 ensembl + - git-ensembl --clone --branch $ENSEMBL_BRANCH --secondary_branch $SECONDARY_BRANCH --depth 1 ensembl-compara + - git-ensembl --clone --branch $ENSEMBL_BRANCH --secondary_branch $SECONDARY_BRANCH --depth 1 ensembl-datacheck + - git-ensembl --clone --branch $ENSEMBL_BRANCH --secondary_branch $SECONDARY_BRANCH --depth 1 ensembl-variation + - git-ensembl --clone --branch $ENSEMBL_BRANCH --secondary_branch $SECONDARY_BRANCH --depth 1 ensembl-metadata + - git-ensembl --clone --branch $ENSEMBL_BRANCH --secondary_branch $SECONDARY_BRANCH --depth 1 ensembl-funcgen + - git-ensembl --clone --branch master --secondary_branch main --depth 1 ensembl-hive + - git-ensembl --clone --branch master --secondary_branch main --depth 1 ensembl-orm + - git-ensembl --clone --branch master --secondary_branch main --depth 1 ensembl-taxonomy + - git clone --branch 1.9 --depth 1 https://github.com/samtools/htslib.git + - git clone --branch release-1-6-924 --depth 1 https://github.com/bioperl/bioperl-live.git + - cd htslib + - make + - export HTSLIB_DIR=$(pwd -P) + - mysql -e "SET GLOBAL local_infile=1;" + - cd .. + install: + - cpanm --sudo -v --installdeps --with-recommends --notest --cpanfile ensembl/cpanfile . + - cpanm --sudo -v --installdeps --notest --cpanfile ensembl-hive/cpanfile . + - cpanm --sudo -v --installdeps --notest --cpanfile ensembl-datacheck/cpanfile . + - export PERL5LIB=$PERL5LIB:$PWD/bioperl-live + - cpanm travisci/kyotocabinet-perl-1.20.tar.gz + - cpanm --sudo -v --installdeps --notest . + - cpanm --sudo -n Devel::Cover::Report::Coveralls + - cp travisci/MultiTestDB.conf.travisci modules/t/MultiTestDB.conf + - mysql -u root -h localhost -e 'GRANT ALL PRIVILEGES ON *.* TO "travis"@"%"' + script: + - ./travisci/harness.sh + + - name: "Python Job" + language: python + python: + - "3.10" + - "3.11" + services: + - mysql + env: + - COVERALLS=true DB=mysql install: - - pip install -e . - pip install -r requirements-test.txt + - pip install -e . 
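+      # MySQL's server-side local_infile flag is off by default; before_script switches it on so tests can use LOAD DATA LOCAL INFILE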
+ before_script: + - mysql -e "SET GLOBAL local_infile=1;" script: - pytest src/python/test - notifications: email: on_success: always on_failure: always slack: - secure: BkrSPAkOM5aTOpeyO9vZnHdZ0LF1PLk0r2HtcXN2eTMyiHoGXkl6VUjdAL8EkzI4gunW2GProdSIjHpf60WdiEmKAulMdJRI+xyUbuxnY31mwiikS9HYwqmPBbMTf0Mh2pMBngZRFs+gaFZDUMTfLfp+8MQfU1R54yb6hPuVt5I= + secure: BkrSPAkOM5aTOpeyO9vZnHdZ0LF1PLk0r2HtcXN2eTMyiHoGXkl6VUjdAL8EkzI4gunW2GProdSIjHpf60WdiEmKAulMdJRI+xyUbuxnY31mwiikS9HYwqmPBbMTf0Mh2pMBngZRFs+gaFZDUMTfLfp+8MQfU1R54yb6hPuVt5I= \ No newline at end of file diff --git a/cpanfile b/cpanfile index ffd9edb0c..6939b8444 100644 --- a/cpanfile +++ b/cpanfile @@ -14,7 +14,7 @@ requires 'File::Slurp'; requires 'Log::Log4perl'; requires 'XML::Simple'; requires 'Time::Duration'; -requires 'Tie::LevelDB'; requires 'IO::Zlib'; requires 'File::Temp'; requires 'Fcntl'; +requires 'KyotoCabinet'; diff --git a/modules/Bio/EnsEMBL/Production/Pipeline/AlphaFold/CreateAlphaDB.pm b/modules/Bio/EnsEMBL/Production/Pipeline/AlphaFold/CreateAlphaDB.pm index b7ec2f278..905ed4fca 100644 --- a/modules/Bio/EnsEMBL/Production/Pipeline/AlphaFold/CreateAlphaDB.pm +++ b/modules/Bio/EnsEMBL/Production/Pipeline/AlphaFold/CreateAlphaDB.pm @@ -33,12 +33,12 @@ This module prepares a DB with a mapping from Uniprot accession to related Alphafold data (Alphafold accession, protein start, end). The DB is created on - disk in LevelDB format. + disk in KyotoCabinet format. =head1 DESCRIPTION - We expect the file accession_ids.csv to be available - - We go through the file and build a LevelDB mapping the Uniprot accession to the Alphafold data + - We go through the file and build a DB mapping the Uniprot accession to the Alphafold data =cut @@ -49,7 +49,7 @@ use strict; use parent 'Bio::EnsEMBL::Production::Pipeline::Common::Base'; use Bio::EnsEMBL::Utils::Exception qw(throw info); -use Tie::LevelDB; +use KyotoCabinet; use File::Temp 'tempdir'; @@ -66,7 +66,7 @@ sub run { throw ("Data file not found: '$map_file' on host " . `hostname`) unless -f $map_file; - my $idx_dir = $self->param_required('alphafold_db_dir') . '/uniprot-to-alpha.leveldb'; + my $idx_dir = $self->param_required('alphafold_db_dir') . '/uniprot-to-alphafold'; if (-d $idx_dir) { system(qw(rm -rf), $idx_dir); } @@ -78,33 +78,42 @@ sub run { $copy_to = $idx_dir; $idx_dir = tempdir(DIR => '/dev/shm/'); } - - tie(my %idx, 'Tie::LevelDB', $idx_dir) - or die "Error trying to tie Tie::LevelDB $idx_dir: $!"; + + my $db = new KyotoCabinet::DB; + + # Set 4 GB mmap size + my $mapsize_gb = 4 << 30; + + # Open the DB + # Open as the exclusive writer, truncate if it exists, otherwise create the DB + # Open the database as a file hash DB, 600M buckets, 4GB mmap, linear option for + # hash collision handling. These are tuned for write speed and for approx. 300M entries. + # As with a regular Perl hash, a duplicate entry will overwrite the previous + # value. + $db->open("$idx_dir/uniprot-to-alphafold.kch#bnum=600000000#msiz=$mapsize_gb#opts=l", + $db->OWRITER | $db->OCREATE | $db->OTRUNCATE + ) or die "Error opening DB: " . 
$db->error(); my $map; open($map, '<', $map_file) or die "Opening map file $map_file failed: $!"; - # A line from accession_ids.csv looks like this: - # Uniprot accession, hit start, hit end, Alphafold accession, Alphafold version - # A0A2I1PIX0,1,200,AF-A0A2I1PIX0-F1,4 - # Currently, all entries in this file have a unique uniprot accession and - # have a hit starting at 1 - while (my $line = <$map>) { + chomp $line; + # A line from accession_ids.csv looks like this: + # Uniprot accession, hit start, hit end, Alphafold accession, Alphafold version + # A0A2I1PIX0,1,200,AF-A0A2I1PIX0-F1,4 + # Currently, all entries in this file have a unique uniprot accession and + # have a hit starting at 1 unless ($line =~ /^\w+,\d+,\d+,[\w_-]+,\d+$/) { - chomp $line; - warn "Data error. Line is not what we expect: '$line'"; - next; + die "Data error. Line is not what we expect: '$line'"; } my @x = split(",", $line, 2); - # This is the DB write operation. Tie::LevelDB will croak on errors (e.g. disk full) - $idx{$x[0]} = $x[1]; + # This is the DB write operation. + $db->set($x[0], $x[1]) or die "Error inserting data: " . $db->error(); } - close($map); - untie %idx; + $db->close() or die "Error closing DB: " . $db->error(); if ($copy_back) { system (qw(cp -r), $idx_dir, $copy_to); diff --git a/modules/Bio/EnsEMBL/Production/Pipeline/AlphaFold/CreateUniparcDB.pm b/modules/Bio/EnsEMBL/Production/Pipeline/AlphaFold/CreateUniparcDB.pm index 58c01117a..da6064f2e 100644 --- a/modules/Bio/EnsEMBL/Production/Pipeline/AlphaFold/CreateUniparcDB.pm +++ b/modules/Bio/EnsEMBL/Production/Pipeline/AlphaFold/CreateUniparcDB.pm @@ -32,12 +32,12 @@ =head1 SYNOPSIS This module prepares a DB with a mapping from Uniparc accession to Uniprot - accession. The DB is created on disk in LevelDB format. + accession. The DB is created on disk in KyotoCabinet format. =head1 DESCRIPTION - We expect the file idmapping_selected.tab.gz to be available - - We go through the file and build a LevelDB mapping the Uniparc accessions to Uniprot accessions + - We go through the file and build a DB mapping the Uniparc accessions to Uniprot accessions =cut @@ -49,7 +49,7 @@ use strict; use parent 'Bio::EnsEMBL::Production::Pipeline::Common::Base'; use Bio::EnsEMBL::Utils::Exception qw(throw info); -use Tie::LevelDB; +use KyotoCabinet; use IO::Zlib; use File::Temp 'tempdir'; @@ -66,7 +66,7 @@ sub run { throw ("Data file not found: '$map_file' on host " . `hostname`) unless -f $map_file; - my $idx_dir = $self->param_required('uniparc_db_dir') . '/uniparc-to-uniprot.leveldb'; + my $idx_dir = $self->param_required('uniparc_db_dir') . '/uniparc-to-uniprot'; if (-d $idx_dir) { system(qw(rm -rf), $idx_dir); } @@ -79,8 +79,21 @@ sub run { $idx_dir = tempdir(DIR => '/dev/shm/'); } - tie(my %idx, 'Tie::LevelDB', $idx_dir) - or die "Error trying to tie Tie::LevelDB $idx_dir: $!"; + my $db = new KyotoCabinet::DB; + + # Set 4 GB mmap size + my $mapsize_gb = 4 << 30; + + # Open the DB + # Open as the exclusive writer, truncate if it exists, otherwise create the DB + # Open the database as a file hash DB, 600M buckets, 4GB mmap, linear option for + # hash collision handling. These are tuned for write speed and for approx. 300M entries. + # Uniparc has 251M entries at the moment. + # As with a regular Perl hash, a duplicate entry will overwrite the previous + # value. + $db->open("$idx_dir/uniparc-to-uniprot.kch#bnum=600000000#msiz=$mapsize_gb#opts=l", + $db->OWRITER | $db->OCREATE | $db->OTRUNCATE + ) or die "Error opening DB: " . 
$db->error(); my $map = new IO::Zlib; $map->open($map_file, 'rb') or die "Opening map file $map_file with IO::Zlib failed: $!"; @@ -90,22 +103,27 @@ sub run { # We pick out the Uniparc accession and Uniprot accession # index[10] (Uniparc): UPI00003B0FD4; index[0] (Uniprot): Q6GZX4 my $line; + while ($line = <$map>) { + chomp $line; unless ($line =~ /^\w+\t[[:print:]\t]+$/) { - warn "Data error: Line is not what we expect: '$line'"; - next; + die "Data error: Line is not what we expect: '$line'"; } my @x = split("\t", $line, 12); unless ($x[10] and $x[10] =~ /^UPI\w+$/) { - warn "Data error: Uniparc accession is not what we expect: '$line'"; - next; + die "Data error: Uniparc accession is not what we expect: '$line'"; + } + # This is the DB write operation. + my $oldval; + if ($oldval = $db->get($x[10])) { + $db->set($x[10], "$oldval\t" . $x[0]) or die "Error inserting data: " . $db->error(); + } else { + $db->set($x[10], $x[0]) or die "Error inserting data: " . $db->error(); } - # This is the DB write operation. Tie::LevelDB will croak on errors (e.g. disk full) - $idx{$x[10]} = $x[0]; } $map->close; - untie %idx; + $db->close() or die "Error closing DB: " . $db->error(); if ($copy_back) { system (qw(cp -r), $idx_dir, $copy_to); diff --git a/modules/Bio/EnsEMBL/Production/Pipeline/AlphaFold/InsertProteinFeatures.pm b/modules/Bio/EnsEMBL/Production/Pipeline/AlphaFold/InsertProteinFeatures.pm index 059ac4a3e..0178fc21e 100644 --- a/modules/Bio/EnsEMBL/Production/Pipeline/AlphaFold/InsertProteinFeatures.pm +++ b/modules/Bio/EnsEMBL/Production/Pipeline/AlphaFold/InsertProteinFeatures.pm @@ -164,7 +164,7 @@ sub run { -db => 'alphafold', -db_version => $alpha_version, -db_file => $self->param('db_dir') . '/accession_ids.csv', - -display_label => 'AlphaFold DB import', + -display_label => 'AFDB-ENSP mapping', -displayable => '1', -description => 'Protein features based on AlphaFold predictions, mapped with GIFTS or UniParc' ); diff --git a/modules/Bio/EnsEMBL/Production/Pipeline/FileDump/Geneset_GFF3.pm b/modules/Bio/EnsEMBL/Production/Pipeline/FileDump/Geneset_GFF3.pm index 49258e302..dff0d0adc 100644 --- a/modules/Bio/EnsEMBL/Production/Pipeline/FileDump/Geneset_GFF3.pm +++ b/modules/Bio/EnsEMBL/Production/Pipeline/FileDump/Geneset_GFF3.pm @@ -266,7 +266,8 @@ sub Bio::EnsEMBL::Transcript::summary_as_hash { $summary{'transcript_support_level'} = $self->tsl if $self->tsl; my @tags; - push(@tags, 'basic') if $self->gencode_basic(); + push(@tags, 'gencode_basic') if $self->gencode_basic(); + push(@tags, 'gencode_primary') if $self->gencode_primary(); push(@tags, 'Ensembl_canonical') if $self->is_canonical(); # A transcript can have different types of MANE-related attributes (MANE_Select, MANE_Plus_Clinical) diff --git a/modules/Bio/EnsEMBL/Production/Pipeline/GFF3/DumpFile.pm b/modules/Bio/EnsEMBL/Production/Pipeline/GFF3/DumpFile.pm index 6d88ab72a..9efa955a6 100644 --- a/modules/Bio/EnsEMBL/Production/Pipeline/GFF3/DumpFile.pm +++ b/modules/Bio/EnsEMBL/Production/Pipeline/GFF3/DumpFile.pm @@ -279,7 +279,8 @@ sub Bio::EnsEMBL::Transcript::summary_as_hash { $summary{'transcript_support_level'} = $self->tsl if $self->tsl; my @tags; - push(@tags, 'basic') if $self->gencode_basic(); + push(@tags, 'gencode_basic') if $self->gencode_basic(); + push(@tags, 'gencode_primary') if $self->gencode_primary(); push(@tags, 'Ensembl_canonical') if $self->is_canonical(); # A transcript can have different types of MANE-related attributes (MANE_Select, MANE_Plus_Clinical) diff --git 
a/modules/Bio/EnsEMBL/Production/Pipeline/GTF/DumpFile.pm b/modules/Bio/EnsEMBL/Production/Pipeline/GTF/DumpFile.pm index 79358a04d..d95763721 100644 --- a/modules/Bio/EnsEMBL/Production/Pipeline/GTF/DumpFile.pm +++ b/modules/Bio/EnsEMBL/Production/Pipeline/GTF/DumpFile.pm @@ -383,7 +383,8 @@ feature for the position of this on the genome - cds_start_NF: the coding region start could not be confirmed - mRNA_end_NF: the mRNA end could not be confirmed - mRNA_start_NF: the mRNA start could not be confirmed. -- basic: the transcript is part of the gencode basic geneset +- gencode_basic: the transcript is part of the gencode basic geneset +- gencode_primary: the transcript is part of the gencode primary geneset Comments diff --git a/modules/Bio/EnsEMBL/Production/Pipeline/Ga4ghChecksum/ChecksumGenerator.pm b/modules/Bio/EnsEMBL/Production/Pipeline/Ga4ghChecksum/ChecksumGenerator.pm index 28aa1ba06..d5d8e3a8a 100644 --- a/modules/Bio/EnsEMBL/Production/Pipeline/Ga4ghChecksum/ChecksumGenerator.pm +++ b/modules/Bio/EnsEMBL/Production/Pipeline/Ga4ghChecksum/ChecksumGenerator.pm @@ -218,7 +218,7 @@ sub all_hashes { } ## end foreach my $slice (@slices) for my $seq_type (keys %$batch) { - for my $attrib_table (keys $batch->{$seq_type}) { + for my $attrib_table (keys %{$batch->{$seq_type}}) { $attribute_adaptor->store_batch_on_Object($attrib_table, $batch->{$seq_type}->{$attrib_table}, 1000); } } diff --git a/modules/Bio/EnsEMBL/Production/Pipeline/JSON/JsonRemodeller.pm b/modules/Bio/EnsEMBL/Production/Pipeline/JSON/JsonRemodeller.pm index d8e8328da..a6738edc0 100644 --- a/modules/Bio/EnsEMBL/Production/Pipeline/JSON/JsonRemodeller.pm +++ b/modules/Bio/EnsEMBL/Production/Pipeline/JSON/JsonRemodeller.pm @@ -292,7 +292,10 @@ sub merge_xrefs { $obj->{$dbname} = []; } for my $ann ( @{ $subobj->{$dbname} } ) { - push $obj->{$dbname}, $self->copy_hash($ann); + if (ref($obj->{$dbname}) ne 'ARRAY') { + $obj->{$dbname} = []; + } + push @{ $obj->{$dbname} }, $self->copy_hash($ann); } } } diff --git a/modules/Bio/EnsEMBL/Production/Pipeline/LoadFamily/AddFamilyMembers.pm b/modules/Bio/EnsEMBL/Production/Pipeline/LoadFamily/AddFamilyMembers.pm index 37da4aabe..043c98ebc 100644 --- a/modules/Bio/EnsEMBL/Production/Pipeline/LoadFamily/AddFamilyMembers.pm +++ b/modules/Bio/EnsEMBL/Production/Pipeline/LoadFamily/AddFamilyMembers.pm @@ -78,11 +78,13 @@ sub run { # create a hash first though, which can then be processed # gene_id as key, then sets of protein-family pairs my $gene_families = {}; + # retrieve family data for canonical translations my $sql = qq/select t.gene_id, t.transcript_id, pf.hit_name from coord_system c join seq_region s using (coord_system_id) - join transcript t using (seq_region_id) - join translation tl using (transcript_id) + join gene g using(seq_region_id) + join transcript t on t.transcript_id = g.canonical_transcript_id + join translation tl on tl.translation_id = t.canonical_translation_id join protein_feature pf using (translation_id) join analysis pfa ON (pf.analysis_id=pfa.analysis_id) where pfa.logic_name in ($logic_names) @@ -104,37 +106,52 @@ sub run { my $family_members = {}; while ( my ( $gene_id, $hits ) = each %$gene_families ) { my $gene = $gene_adaptor->fetch_by_dbID($gene_id); - # create and store gene member - my $gene_member = - Bio::EnsEMBL::Compara::GeneMember->new_from_Gene( - -GENE => $gene, - -GENOME_DB => $genome_db, - -BIOTYPE_GROUP => $gene->get_Biotype->biotype_group() - ); - # If there are duplicate stable IDs, trap fatal error from compara - # method, so we can 
skip it and carry on with others. - eval { - $gene_member_dba->store($gene_member); - }; - if ($@) { - my ($msg) = $@ =~ /MSG:\s+([^\n]+)/m; - $self->warning('Duplicate stable ID: '.$msg); + + # retrieve or create-and-store gene_member from the current genome_db + my $gene_member = $gene_member_dba->fetch_by_stable_id_GenomeDB($gene->stable_id, + $genome_db); + my $existing_canonical; + if (defined $gene_member) { + $existing_canonical = $seq_member_dba->fetch_by_dbID( $gene_member->canonical_member_id ); } else { - for my $hit (@$hits) { - my $transcript = - $transcript_adaptor->fetch_by_dbID( $hit->[0] ); - my $seq_member = - Bio::EnsEMBL::Compara::SeqMember->new_from_Transcript( - -TRANSCRIPT => $transcript, - -TRANSLATE => 'yes', - -GENOME_DB => $genome_db - ); + $gene_member = + Bio::EnsEMBL::Compara::GeneMember->new_from_Gene( + -GENE => $gene, + -GENOME_DB => $genome_db, + -BIOTYPE_GROUP => $gene->get_Biotype->biotype_group() + ); + $gene_member_dba->store($gene_member); + } + + for my $hit (@$hits) { + my $transcript = + $transcript_adaptor->fetch_by_dbID( $hit->[0] ); + my $translation_stable_id = $transcript->translation->stable_id; + + if (defined $existing_canonical && $translation_stable_id ne $existing_canonical->stable_id) { + $self->warning(sprintf('skipping translation %s because stable ID does not match canonical member %s', + $translation_stable_id, $existing_canonical->stable_id)); + next; + } + + # retrieve or create-and-store seq_member from the current genome_db + my $seq_member = + $seq_member_dba->fetch_by_stable_id_GenomeDB($translation_stable_id, + $genome_db); + if (!defined $seq_member) { + $seq_member = + Bio::EnsEMBL::Compara::SeqMember->new_from_Transcript( + -TRANSCRIPT => $transcript, + -TRANSLATE => 'yes', + -GENOME_DB => $genome_db + ); # TODO store CDS too? $seq_member->gene_member_id( $gene_member->dbID ); $seq_member_dba->store($seq_member); $seq_member_dba->_set_member_as_canonical($seq_member); - push @{ $family_members->{ $hit->[1] } }, $seq_member->dbID(); } + + push @{ $family_members->{ $hit->[1] } }, $seq_member->dbID(); } } ## end while ( my ( $gene_id, $hits...)) print "Saving familes for ".$dba->species()."\n"; diff --git a/modules/Bio/EnsEMBL/Production/Pipeline/LoadFamily/CreateFamilies.pm b/modules/Bio/EnsEMBL/Production/Pipeline/LoadFamily/CreateFamilies.pm index e5cd4784c..f505d87fd 100644 --- a/modules/Bio/EnsEMBL/Production/Pipeline/LoadFamily/CreateFamilies.pm +++ b/modules/Bio/EnsEMBL/Production/Pipeline/LoadFamily/CreateFamilies.pm @@ -66,7 +66,9 @@ sub run { my $compara_dba = $compara_dbas[0]; $self->param('compara_url', $compara_dba->url); my $schema_version = $compara_dba->get_MetaContainer->get_schema_version(); - $compara_dba->dbc()->sql_helper()->execute_update(-SQL=>'delete family.*,family_member.* from family left join family_member using (family_id)'); + my $compara_sql_helper = $compara_dba->dbc()->sql_helper(); + $compara_sql_helper->execute_update(-SQL=>'delete family.*,family_member.* from family left join family_member using (family_id)'); + # get compara my $genome_dba = $compara_dba->get_GenomeDBAdaptor(); @@ -143,25 +145,36 @@ sub run { print "Found ".scalar(keys(%{$families}))." 
familes\n"; # create and store MLSS - my $sso = Bio::EnsEMBL::Compara::SpeciesSet->new( - -GENOME_DBS => $genome_dbs, - -NAME => "collection-all_division", - ); - $sso->first_release($schema_version); - $compara_dba->get_SpeciesSetAdaptor()->store($sso); - - my $mlss = - Bio::EnsEMBL::Compara::MethodLinkSpeciesSet->new( - -method => - Bio::EnsEMBL::Compara::Method->new( - -type => 'FAMILY', - -class => 'Family.family', - -display_name => 'families' - ), - -species_set => $sso ); - $mlss->first_release($schema_version); - - $compara_dba->get_MethodLinkSpeciesSetAdaptor()->store($mlss); + my $sql = q/ + insert ignore into method_link (method_link_id, type, class, display_name) + values (301, 'FAMILY', 'Family.family', 'families')/; + $compara_sql_helper->execute_update( -SQL => $sql ); + my $method_dba = $compara_dba->get_MethodAdaptor(); + my $method = $method_dba->fetch_by_type('FAMILY'); + + my $species_set_dba = $compara_dba->get_SpeciesSetAdaptor(); + my $sso = $species_set_dba->fetch_by_GenomeDBs($genome_dbs); + if (!defined $sso) { + $sso = Bio::EnsEMBL::Compara::SpeciesSet->new( + -GENOME_DBS => $genome_dbs, + -NAME => "collection-all_division", + ); + $sso->first_release($schema_version); + $species_set_dba->store($sso); + } + + my $mlss_dba = $compara_dba->get_MethodLinkSpeciesSetAdaptor(); + my $mlss = $mlss_dba->fetch_by_method_link_type_GenomeDBs($method->type, $genome_dbs); + if (!defined $mlss) { + $mlss = + Bio::EnsEMBL::Compara::MethodLinkSpeciesSet->new( + -method => $method, + -name => 'all_division families', + -species_set => $sso ); + $mlss->first_release($schema_version); + $mlss_dba->store($mlss); + } + my $family_dba = $compara_dba->get_FamilyAdaptor(); while ( my ( $id, $name ) = each %$families ) { print "Storing family $id $name\n"; diff --git a/modules/Bio/EnsEMBL/Production/Pipeline/Ortholog/SourceFactory.pm b/modules/Bio/EnsEMBL/Production/Pipeline/Ortholog/SourceFactory.pm index 4599d8636..37cd8b09e 100644 --- a/modules/Bio/EnsEMBL/Production/Pipeline/Ortholog/SourceFactory.pm +++ b/modules/Bio/EnsEMBL/Production/Pipeline/Ortholog/SourceFactory.pm @@ -59,7 +59,7 @@ sub write_output { my $compara_param = $self->param('compara'); my $cleanup_dir = $self->param('cleanup_dir'); - foreach my $pair (keys $sp_config) { + foreach my $pair (keys %{$sp_config}) { my $compara = $sp_config->{$pair}->{'compara'}; if (defined $compara_param && $compara ne $compara_param) { print STDERR "Skipping $compara\n"; diff --git a/modules/Bio/EnsEMBL/Production/Pipeline/PipeConfig/AlphaDBImport_conf.pm b/modules/Bio/EnsEMBL/Production/Pipeline/PipeConfig/AlphaDBImport_conf.pm index b05614659..f16b1dfbe 100644 --- a/modules/Bio/EnsEMBL/Production/Pipeline/PipeConfig/AlphaDBImport_conf.pm +++ b/modules/Bio/EnsEMBL/Production/Pipeline/PipeConfig/AlphaDBImport_conf.pm @@ -45,7 +45,7 @@ use warnings; use base ('Bio::EnsEMBL::Production::Pipeline::PipeConfig::Base_conf'); use Bio::EnsEMBL::Hive::PipeConfig::HiveGeneric_conf; -use Bio::EnsEMBL::Hive::Version 2.6; +use Bio::EnsEMBL::Hive::Version 2.7; =head2 default_options diff --git a/modules/Bio/EnsEMBL/Production/Pipeline/PipeConfig/BasePython_conf.pm b/modules/Bio/EnsEMBL/Production/Pipeline/PipeConfig/BasePython_conf.pm index 2e51ffe33..54906c466 100644 --- a/modules/Bio/EnsEMBL/Production/Pipeline/PipeConfig/BasePython_conf.pm +++ b/modules/Bio/EnsEMBL/Production/Pipeline/PipeConfig/BasePython_conf.pm @@ -25,7 +25,7 @@ use strict; use warnings; use Data::Dumper; use Bio::EnsEMBL::Hive::PipeConfig::HiveGeneric_conf; -use 
Bio::EnsEMBL::Hive::Version 2.5; +use Bio::EnsEMBL::Hive::Version 2.7; use base ('Bio::EnsEMBL::Hive::PipeConfig::EnsemblGeneric_conf'); diff --git a/modules/Bio/EnsEMBL/Production/Pipeline/PipeConfig/Base_conf.pm b/modules/Bio/EnsEMBL/Production/Pipeline/PipeConfig/Base_conf.pm index 157809e9d..4f0c0a558 100644 --- a/modules/Bio/EnsEMBL/Production/Pipeline/PipeConfig/Base_conf.pm +++ b/modules/Bio/EnsEMBL/Production/Pipeline/PipeConfig/Base_conf.pm @@ -24,7 +24,7 @@ use warnings; use base ('Bio::EnsEMBL::Hive::PipeConfig::EnsemblGeneric_conf'); use Bio::EnsEMBL::Hive::PipeConfig::HiveGeneric_conf; -use Bio::EnsEMBL::Hive::Version 2.5; +use Bio::EnsEMBL::Hive::Version 2.7; use File::Spec::Functions qw(catdir); @@ -61,7 +61,7 @@ sub default_options { 'species' => [], 'antispecies' => [], 'batch_size' => 50, - 'meta_filters' => {}, + 'meta_filters' => {}, 'update_dataset_status' => 'Processing', #updates dataset status in new metadata db #param to connect to old pipeline analysis name 'genome_factory_dynamic_output_flow' => { @@ -83,7 +83,7 @@ sub factory_analyses { -input_ids => [{}], -flow_into => { '1' => ['GenomeFactory'], - + }, -rc_name => 'default', }, @@ -98,12 +98,12 @@ sub factory_analyses { 'dataset_type' => $self->o('dataset_type'), 'dataset_status' => $self->o('dataset_status'), 'division' => $self->o('division'), - 'organism_group_type' => $self->o('organism_group_type'), + 'organism_group_type' => $self->o('organism_group_type'), 'species' => $self->o('species'), - 'antispecies' => $self->o('antispecies'), + 'antispecies' => $self->o('antispecies'), 'batch_size' => $self->o('batch_size'), - 'update_dataset_status' => $self->o('update_dataset_status'), - }, + 'update_dataset_status' => $self->o('update_dataset_status'), + }, -flow_into => $self->o('genome_factory_dynamic_output_flow'), }, @@ -111,7 +111,7 @@ sub factory_analyses { -logic_name => 'UpdateDatasetStatus', -module => 'ensembl.production.hive.HiveDatasetFactory', -language => 'python3', - -rc_name => 'default', + -rc_name => 'default', -parameters => { 'metadata_db_uri' => $self->o('metadata_db_uri'), 'update_dataset_status' => $self->o('update_dataset_status'), @@ -143,6 +143,8 @@ sub resource_classes { ); my %memory = ( + '100M' => '100', + '200M' => '200', '500M' => '500', '1GB' => '1000', '2GB' => '2000', diff --git a/modules/Bio/EnsEMBL/Production/Pipeline/PipeConfig/BulkSQL_conf.pm b/modules/Bio/EnsEMBL/Production/Pipeline/PipeConfig/BulkSQL_conf.pm index 3c2bab042..b0503c433 100644 --- a/modules/Bio/EnsEMBL/Production/Pipeline/PipeConfig/BulkSQL_conf.pm +++ b/modules/Bio/EnsEMBL/Production/Pipeline/PipeConfig/BulkSQL_conf.pm @@ -38,7 +38,7 @@ use warnings; use base ('Bio::EnsEMBL::Production::Pipeline::PipeConfig::Base_conf'); use Bio::EnsEMBL::Hive::PipeConfig::HiveGeneric_conf; -use Bio::EnsEMBL::Hive::Version 2.5; +use Bio::EnsEMBL::Hive::Version 2.7; sub default_options { my ($self) = @_; diff --git a/modules/Bio/EnsEMBL/Production/Pipeline/PipeConfig/ChecksumGenerator_conf.pm b/modules/Bio/EnsEMBL/Production/Pipeline/PipeConfig/ChecksumGenerator_conf.pm index c7198cc3d..781f60416 100644 --- a/modules/Bio/EnsEMBL/Production/Pipeline/PipeConfig/ChecksumGenerator_conf.pm +++ b/modules/Bio/EnsEMBL/Production/Pipeline/PipeConfig/ChecksumGenerator_conf.pm @@ -25,7 +25,7 @@ use warnings; use base ('Bio::EnsEMBL::Production::Pipeline::PipeConfig::Base_conf'); use Bio::EnsEMBL::Hive::PipeConfig::HiveGeneric_conf; -use Bio::EnsEMBL::Hive::Version 2.5; +use Bio::EnsEMBL::Hive::Version 2.7; use File::Spec; sub 
default_options { diff --git a/modules/Bio/EnsEMBL/Production/Pipeline/PipeConfig/ChecksumLoader_conf.pm b/modules/Bio/EnsEMBL/Production/Pipeline/PipeConfig/ChecksumLoader_conf.pm index a7e110a93..977ae0568 100644 --- a/modules/Bio/EnsEMBL/Production/Pipeline/PipeConfig/ChecksumLoader_conf.pm +++ b/modules/Bio/EnsEMBL/Production/Pipeline/PipeConfig/ChecksumLoader_conf.pm @@ -26,7 +26,7 @@ use warnings; use Data::Dumper; use base ('Bio::EnsEMBL::Production::Pipeline::PipeConfig::Base_conf'); -use Bio::EnsEMBL::Hive::Version 2.5; +use Bio::EnsEMBL::Hive::Version 2.7; use Bio::EnsEMBL::Hive::PipeConfig::HiveGeneric_conf; diff --git a/modules/Bio/EnsEMBL/Production/Pipeline/PipeConfig/CoreStatistics_conf.pm b/modules/Bio/EnsEMBL/Production/Pipeline/PipeConfig/CoreStatistics_conf.pm index 236ad1365..1cf6a365a 100644 --- a/modules/Bio/EnsEMBL/Production/Pipeline/PipeConfig/CoreStatistics_conf.pm +++ b/modules/Bio/EnsEMBL/Production/Pipeline/PipeConfig/CoreStatistics_conf.pm @@ -38,7 +38,7 @@ use warnings; use base ('Bio::EnsEMBL::Production::Pipeline::PipeConfig::Base_conf'); use Bio::EnsEMBL::ApiVersion qw/software_version/; -use Bio::EnsEMBL::Hive::Version 2.5; +use Bio::EnsEMBL::Hive::Version 2.7; use Bio::EnsEMBL::Hive::PipeConfig::HiveGeneric_conf; sub default_options { @@ -130,7 +130,7 @@ sub pipeline_analyses { '3->A' => ['CheckStatistics_Chromosome'], 'A->1' => ['SpeciesFactory_All'], }, - -rc_name => '2GB', + -rc_name => '2GB_D', }, @@ -211,7 +211,7 @@ sub pipeline_analyses { }, -max_retry_count => 1, -hive_capacity => 50, - -rc_name => 'default_W' + -rc_name => '2GB_W' }, { @@ -262,7 +262,7 @@ sub pipeline_analyses { -max_retry_count => 1, -hive_capacity => 50, -flow_into => ['GeneGC_Datacheck'], - -rc_name => 'default_D', + -rc_name => '2GB_D', }, { @@ -327,7 +327,7 @@ sub pipeline_analyses { -max_retry_count => 1, -hive_capacity => 50, -batch_size => 10, - -rc_name => 'default', + -rc_name => '1GB', }, # { diff --git a/modules/Bio/EnsEMBL/Production/Pipeline/PipeConfig/DataChecksNonCore_conf.pm b/modules/Bio/EnsEMBL/Production/Pipeline/PipeConfig/DataChecksNonCore_conf.pm index 71d3b7655..de8aaffd0 100644 --- a/modules/Bio/EnsEMBL/Production/Pipeline/PipeConfig/DataChecksNonCore_conf.pm +++ b/modules/Bio/EnsEMBL/Production/Pipeline/PipeConfig/DataChecksNonCore_conf.pm @@ -31,7 +31,7 @@ use warnings; use base ('Bio::EnsEMBL::DataCheck::Pipeline::DbDataChecks_conf'); use Bio::EnsEMBL::Hive::PipeConfig::HiveGeneric_conf; -use Bio::EnsEMBL::Hive::Version 2.5; +use Bio::EnsEMBL::Hive::Version 2.7; sub default_options { my ($self) = @_; diff --git a/modules/Bio/EnsEMBL/Production/Pipeline/PipeConfig/DbCopy_conf.pm b/modules/Bio/EnsEMBL/Production/Pipeline/PipeConfig/DbCopy_conf.pm index 11f63e43c..627e0ab05 100644 --- a/modules/Bio/EnsEMBL/Production/Pipeline/PipeConfig/DbCopy_conf.pm +++ b/modules/Bio/EnsEMBL/Production/Pipeline/PipeConfig/DbCopy_conf.pm @@ -25,7 +25,7 @@ use warnings; use base ('Bio::EnsEMBL::Production::Pipeline::PipeConfig::Base_conf'); use Bio::EnsEMBL::Hive::PipeConfig::HiveGeneric_conf; -use Bio::EnsEMBL::Hive::Version 2.5; +use Bio::EnsEMBL::Hive::Version 2.7; use File::Spec::Functions qw(catdir); sub default_options { @@ -39,6 +39,7 @@ sub default_options { # Database type factory groups => 1, group => [], + delete_group => [], # Named database factory dbname => [], @@ -58,7 +59,6 @@ sub default_options { # Drop databases from target, by default the same set that will be copied delete_db => 0, delete_release => $self->o('ensembl_release'), - delete_group => 
$self->o('group'), delete_dbname => $self->o('dbname'), delete_marts => $self->o('marts'), delete_compara => $self->o('compara'), @@ -200,7 +200,7 @@ sub pipeline_analyses { -max_retry_count => 1, -parameters => { ensembl_release => $self->o('delete_release'), - group => $self->o('delete_group'), + group => (ref($self->o('delete_group')) eq 'ARRAY' && @{$self->o('delete_group')}) ? $self->o('delete_group') : $self->o('group'), groups => $self->o('groups'), }, -flow_into => { diff --git a/modules/Bio/EnsEMBL/Production/Pipeline/PipeConfig/DumpCore_conf.pm b/modules/Bio/EnsEMBL/Production/Pipeline/PipeConfig/DumpCore_conf.pm index 89f38d607..02bbb9cc9 100644 --- a/modules/Bio/EnsEMBL/Production/Pipeline/PipeConfig/DumpCore_conf.pm +++ b/modules/Bio/EnsEMBL/Production/Pipeline/PipeConfig/DumpCore_conf.pm @@ -25,7 +25,7 @@ use warnings; use base ('Bio::EnsEMBL::Production::Pipeline::PipeConfig::Base_conf'); use Bio::EnsEMBL::Hive::PipeConfig::HiveGeneric_conf; -use Bio::EnsEMBL::Hive::Version 2.5; +use Bio::EnsEMBL::Hive::Version 2.7; use File::Spec; sub default_options { diff --git a/modules/Bio/EnsEMBL/Production/Pipeline/PipeConfig/DumpOrtholog_conf_strains.pm b/modules/Bio/EnsEMBL/Production/Pipeline/PipeConfig/DumpOrtholog_conf_strains.pm index 41a9ca74d..b0bd2e3a9 100644 --- a/modules/Bio/EnsEMBL/Production/Pipeline/PipeConfig/DumpOrtholog_conf_strains.pm +++ b/modules/Bio/EnsEMBL/Production/Pipeline/PipeConfig/DumpOrtholog_conf_strains.pm @@ -24,7 +24,7 @@ use warnings; use base ('Bio::EnsEMBL::Production::Pipeline::PipeConfig::DumpOrtholog_conf'); -use Bio::EnsEMBL::Hive::Version 2.5; +use Bio::EnsEMBL::Hive::Version 2.7; sub default_options { my ($self) = @_; diff --git a/modules/Bio/EnsEMBL/Production/Pipeline/PipeConfig/DumpOrtholog_eg_conf.pm b/modules/Bio/EnsEMBL/Production/Pipeline/PipeConfig/DumpOrtholog_eg_conf.pm index 5cbdea73f..fd730fc3e 100644 --- a/modules/Bio/EnsEMBL/Production/Pipeline/PipeConfig/DumpOrtholog_eg_conf.pm +++ b/modules/Bio/EnsEMBL/Production/Pipeline/PipeConfig/DumpOrtholog_eg_conf.pm @@ -24,7 +24,7 @@ use warnings; use base ('Bio::EnsEMBL::Production::Pipeline::PipeConfig::DumpOrtholog_conf'); -use Bio::EnsEMBL::Hive::Version 2.5; +use Bio::EnsEMBL::Hive::Version 2.7; sub default_options { my ($self) = @_; @@ -92,17 +92,17 @@ sub default_options { compara => 'metazoa', source => 'drosophila_melanogaster', species => [ - 'drosophila_ananassae', - 'drosophila_erecta', - 'drosophila_grimshawi', - 'drosophila_mojavensis', - 'drosophila_persimilis', - 'drosophila_pseudoobscura', - 'drosophila_sechellia', - 'drosophila_simulans', - 'drosophila_virilis', - 'drosophila_willistoni', - 'drosophila_yakuba' + 'drosophila_ananassae_gca017639315v2rs', + 'drosophila_erecta_gca003286155v2rs', + 'drosophila_grimshawi_gca018153295v1rs', + 'drosophila_mojavensis_gca018153725v1rs', + 'drosophila_persimilis_gca003286085v2rs', + 'drosophila_pseudoobscura_gca009870125v2rs', + 'drosophila_sechellia_gca004382195v2rs', + 'drosophila_simulans_gca016746395v2rs', + 'drosophila_virilis_gca003285735v2rs', + 'drosophila_willistoni_gca018902025v2rs', + 'drosophila_yakuba_gca016746365v2rs' ], antispecies => 'drosophila_melanogaster', division => 'EnsemblMetazoa', diff --git a/modules/Bio/EnsEMBL/Production/Pipeline/PipeConfig/DumpOrtholog_ensembl_conf.pm b/modules/Bio/EnsEMBL/Production/Pipeline/PipeConfig/DumpOrtholog_ensembl_conf.pm index a0e6ab38b..46f9513cf 100644 --- a/modules/Bio/EnsEMBL/Production/Pipeline/PipeConfig/DumpOrtholog_ensembl_conf.pm +++ 
b/modules/Bio/EnsEMBL/Production/Pipeline/PipeConfig/DumpOrtholog_ensembl_conf.pm @@ -24,7 +24,7 @@ use warnings; use base ('Bio::EnsEMBL::Production::Pipeline::PipeConfig::DumpOrtholog_conf'); -use Bio::EnsEMBL::Hive::Version 2.5; +use Bio::EnsEMBL::Hive::Version 2.7; sub default_options { my ($self) = @_; diff --git a/modules/Bio/EnsEMBL/Production/Pipeline/PipeConfig/DumpSpeciesForGOA_conf.pm b/modules/Bio/EnsEMBL/Production/Pipeline/PipeConfig/DumpSpeciesForGOA_conf.pm index cd3b8e9a1..171494407 100644 --- a/modules/Bio/EnsEMBL/Production/Pipeline/PipeConfig/DumpSpeciesForGOA_conf.pm +++ b/modules/Bio/EnsEMBL/Production/Pipeline/PipeConfig/DumpSpeciesForGOA_conf.pm @@ -34,7 +34,7 @@ use warnings; use base ('Bio::EnsEMBL::Production::Pipeline::PipeConfig::Base_conf'); -use Bio::EnsEMBL::Hive::Version 2.5; +use Bio::EnsEMBL::Hive::Version 2.7; use File::Spec::Functions qw(catdir); sub default_options { diff --git a/modules/Bio/EnsEMBL/Production/Pipeline/PipeConfig/EarlyDumps_conf.pm b/modules/Bio/EnsEMBL/Production/Pipeline/PipeConfig/EarlyDumps_conf.pm index 0637f1043..9d809f563 100644 --- a/modules/Bio/EnsEMBL/Production/Pipeline/PipeConfig/EarlyDumps_conf.pm +++ b/modules/Bio/EnsEMBL/Production/Pipeline/PipeConfig/EarlyDumps_conf.pm @@ -26,7 +26,7 @@ use base ('Bio::EnsEMBL::Production::Pipeline::PipeConfig::Base_conf'); use Bio::EnsEMBL::Production::Pipeline::PipeConfig::FileDumpMySQL_conf; use Bio::EnsEMBL::Production::Pipeline::PipeConfig::DumpCore_conf; use Bio::EnsEMBL::Hive::PipeConfig::HiveGeneric_conf; -use Bio::EnsEMBL::Hive::Version 2.5; +use Bio::EnsEMBL::Hive::Version 2.7; use File::Spec::Functions qw(catdir); @@ -56,7 +56,7 @@ sub default_options { metadata_base_dir => catdir($self->o('ENV', 'NOBACKUP_DIR'), $self->o('username'), 'genome_reports_'.$self->o('ensembl_version')), metadata_script => catdir($self->o('base_dir'), '/ensembl-metadata/misc_scripts/report_genomes.pl'), division_pattern_nonvert => '.fungi,.metazoa,.plants,.protists', - early_dump_base_path => catdir($self->o('ENV', 'NOBACKUP_DIR'), '/release_dumps/'), + early_dump_base_path => catdir($self->o('ENV', 'NOBACKUP_DIR'), '/release_dumps/', '/release-'.$self->o('ensembl_version').'/ftp_dumps/'), nfs_early_dump_path => '/nfs/production/flicek/ensembl/production/ensemblftp/', early_dumps_private_ftp => catdir('/nfs/ftp/private/ensembl/pre-releases','/release-'.$self->o('ensembl_version').'_'.$self->o('eg_version')), #flags to restrict division @@ -131,37 +131,14 @@ sub pipeline_analyses { -module => 'Bio::EnsEMBL::Hive::RunnableDB::SystemCmd', -max_retry_count => 1, -parameters => { - early_dump_path_vert => catdir($self->o('early_dump_base_path'), '/release-'.$self->o('ensembl_version')), - nfs_early_dump_path_vert => catdir($self->o('nfs_early_dump_path'), '/release-'.$self->o('ensembl_version')), - early_dump_path_nonvert => catdir($self->o('early_dump_base_path'), '/release-'.$self->o('eg_version')), - nfs_early_dump_path_nonvert => catdir($self->o('nfs_early_dump_path'), '/release-'.$self->o('eg_version')), + nfs_early_dump_path_vert => catdir($self->o('nfs_early_dump_path'), '/release-'.$self->o('ensembl_version')), + nfs_early_dump_path_nonvert => catdir($self->o('nfs_early_dump_path'), '/release-'.$self->o('eg_version')), cmd => q{ - rsync -avW #early_dump_path_vert# #nfs_early_dump_path_vert# - rsync -avW #early_dump_path_nonvert# #nfs_early_dump_path_nonvert# - + rsync -avW --include=vertebrates --exclude=plants --exclude=protists --exclude=fungi --exclude=bacteria --exclude=metazoa #early_dump_base_path# 
#nfs_early_dump_path_vert# + rsync -avW --include=plants --include=protists --include=fungi --include=bacteria --include=metazoa --exclude=vertebrates #early_dump_base_path# #nfs_early_dump_path_nonvert# }, }, - -flow_into => { '1' => 'CopyToPublicFtp' }, - - - }, - { - -logic_name => 'CopyToPublicFtp', - -module => 'Bio::EnsEMBL::Hive::RunnableDB::SystemCmd', - -max_retry_count => 1, - -parameters => { - nfs_early_dump_path_vert => catdir($self->o('nfs_early_dump_path'), '/release-'.$self->o('ensembl_version')), - nfs_early_dump_path_nonvert => catdir($self->o('nfs_early_dump_path'), '/release-'.$self->o('eg_version')), - early_dumps_private_ftp => $self->o('early_dumps_private_ftp'), - cmd => q{ - rsync -avW #nfs_early_dump_path_vert#/verterates/ #early_dumps_private_ftp# - rsync -avW #nfs_early_dump_path_nonvert#/ #early_dumps_private_ftp# - - }, - }, - - -flow_into => { '1' => 'Email' }, - + -flow_into => { '1' => 'Email' }, }, { -logic_name => 'MetaDataReport', diff --git a/modules/Bio/EnsEMBL/Production/Pipeline/PipeConfig/EnsemblSearchDumps_conf.pm b/modules/Bio/EnsEMBL/Production/Pipeline/PipeConfig/EnsemblSearchDumps_conf.pm index 8910c2531..7bd38728e 100644 --- a/modules/Bio/EnsEMBL/Production/Pipeline/PipeConfig/EnsemblSearchDumps_conf.pm +++ b/modules/Bio/EnsEMBL/Production/Pipeline/PipeConfig/EnsemblSearchDumps_conf.pm @@ -27,7 +27,7 @@ use warnings; use base ('Bio::EnsEMBL::Production::Pipeline::PipeConfig::Base_conf'); use Bio::EnsEMBL::Hive::PipeConfig::HiveGeneric_conf; -use Bio::EnsEMBL::Hive::Version 2.5; +use Bio::EnsEMBL::Hive::Version 2.7; sub default_options { my ($self) = @_; diff --git a/modules/Bio/EnsEMBL/Production/Pipeline/PipeConfig/EnsemblThoasDumps_conf.pm b/modules/Bio/EnsEMBL/Production/Pipeline/PipeConfig/EnsemblThoasDumps_conf.pm index e5153532e..61361da4c 100644 --- a/modules/Bio/EnsEMBL/Production/Pipeline/PipeConfig/EnsemblThoasDumps_conf.pm +++ b/modules/Bio/EnsEMBL/Production/Pipeline/PipeConfig/EnsemblThoasDumps_conf.pm @@ -27,7 +27,7 @@ use warnings; use base ('Bio::EnsEMBL::Production::Pipeline::PipeConfig::Base_conf'); use Bio::EnsEMBL::Hive::PipeConfig::HiveGeneric_conf; -use Bio::EnsEMBL::Hive::Version 2.5; +use Bio::EnsEMBL::Hive::Version 2.7; sub default_options { my ($self) = @_; diff --git a/modules/Bio/EnsEMBL/Production/Pipeline/PipeConfig/FactoryTest_conf.pm b/modules/Bio/EnsEMBL/Production/Pipeline/PipeConfig/FactoryTest_conf.pm index 5c8590d06..af53f5fd7 100755 --- a/modules/Bio/EnsEMBL/Production/Pipeline/PipeConfig/FactoryTest_conf.pm +++ b/modules/Bio/EnsEMBL/Production/Pipeline/PipeConfig/FactoryTest_conf.pm @@ -26,7 +26,7 @@ use warnings; use base ('Bio::EnsEMBL::Production::Pipeline::PipeConfig::Base_conf'); use Bio::EnsEMBL::Hive::PipeConfig::HiveGeneric_conf; -use Bio::EnsEMBL::Hive::Version 2.5; +use Bio::EnsEMBL::Hive::Version 2.7; sub default_options { my ($self) = @_; diff --git a/modules/Bio/EnsEMBL/Production/Pipeline/PipeConfig/FileDumpMySQL_conf.pm b/modules/Bio/EnsEMBL/Production/Pipeline/PipeConfig/FileDumpMySQL_conf.pm index a6f5f3229..aff1a5ba6 100644 --- a/modules/Bio/EnsEMBL/Production/Pipeline/PipeConfig/FileDumpMySQL_conf.pm +++ b/modules/Bio/EnsEMBL/Production/Pipeline/PipeConfig/FileDumpMySQL_conf.pm @@ -25,7 +25,7 @@ use warnings; use base ('Bio::EnsEMBL::Production::Pipeline::PipeConfig::Base_conf'); use Bio::EnsEMBL::Hive::PipeConfig::HiveGeneric_conf; -use Bio::EnsEMBL::Hive::Version 2.5; +use Bio::EnsEMBL::Hive::Version 2.7; use File::Spec::Functions qw(catdir); sub default_options { diff --git 
a/modules/Bio/EnsEMBL/Production/Pipeline/PipeConfig/GPAD_conf.pm b/modules/Bio/EnsEMBL/Production/Pipeline/PipeConfig/GPAD_conf.pm index d5346d9f7..0f594487d 100644 --- a/modules/Bio/EnsEMBL/Production/Pipeline/PipeConfig/GPAD_conf.pm +++ b/modules/Bio/EnsEMBL/Production/Pipeline/PipeConfig/GPAD_conf.pm @@ -24,7 +24,7 @@ use warnings; use base ('Bio::EnsEMBL::Production::Pipeline::PipeConfig::Base_conf'); -use Bio::EnsEMBL::Hive::Version 2.5; +use Bio::EnsEMBL::Hive::Version 2.7; use File::Spec::Functions qw(catdir); sub default_options { diff --git a/modules/Bio/EnsEMBL/Production/Pipeline/PipeConfig/GeneAutoComplete_conf.pm b/modules/Bio/EnsEMBL/Production/Pipeline/PipeConfig/GeneAutoComplete_conf.pm index 59eeae96e..b31a3f0eb 100755 --- a/modules/Bio/EnsEMBL/Production/Pipeline/PipeConfig/GeneAutoComplete_conf.pm +++ b/modules/Bio/EnsEMBL/Production/Pipeline/PipeConfig/GeneAutoComplete_conf.pm @@ -37,7 +37,7 @@ use warnings; use base ('Bio::EnsEMBL::Production::Pipeline::PipeConfig::Base_conf'); use Bio::EnsEMBL::Hive::PipeConfig::HiveGeneric_conf; -use Bio::EnsEMBL::Hive::Version 2.5; +use Bio::EnsEMBL::Hive::Version 2.7; sub default_options { my ($self) = @_; diff --git a/modules/Bio/EnsEMBL/Production/Pipeline/PipeConfig/GeneTreeHighlighting_conf.pm b/modules/Bio/EnsEMBL/Production/Pipeline/PipeConfig/GeneTreeHighlighting_conf.pm index 075faf886..24d418067 100755 --- a/modules/Bio/EnsEMBL/Production/Pipeline/PipeConfig/GeneTreeHighlighting_conf.pm +++ b/modules/Bio/EnsEMBL/Production/Pipeline/PipeConfig/GeneTreeHighlighting_conf.pm @@ -32,7 +32,7 @@ use warnings; use base ('Bio::EnsEMBL::Production::Pipeline::PipeConfig::Base_conf'); -use Bio::EnsEMBL::Hive::Version 2.5; +use Bio::EnsEMBL::Hive::Version 2.7; use Bio::EnsEMBL::Hive::PipeConfig::HiveGeneric_conf; sub default_options { diff --git a/modules/Bio/EnsEMBL/Production/Pipeline/PipeConfig/GrantMySQL_conf.pm b/modules/Bio/EnsEMBL/Production/Pipeline/PipeConfig/GrantMySQL_conf.pm index a81a2c03a..0a9d9fe4c 100644 --- a/modules/Bio/EnsEMBL/Production/Pipeline/PipeConfig/GrantMySQL_conf.pm +++ b/modules/Bio/EnsEMBL/Production/Pipeline/PipeConfig/GrantMySQL_conf.pm @@ -25,7 +25,7 @@ use warnings; use base ('Bio::EnsEMBL::Production::Pipeline::PipeConfig::Base_conf'); use Bio::EnsEMBL::Hive::PipeConfig::HiveGeneric_conf; -use Bio::EnsEMBL::Hive::Version 2.5; +use Bio::EnsEMBL::Hive::Version 2.7; sub default_options { my ($self) = @_; diff --git a/modules/Bio/EnsEMBL/Production/Pipeline/PipeConfig/LoadAppris_conf.pm b/modules/Bio/EnsEMBL/Production/Pipeline/PipeConfig/LoadAppris_conf.pm index 15c3a2486..793ec0d50 100644 --- a/modules/Bio/EnsEMBL/Production/Pipeline/PipeConfig/LoadAppris_conf.pm +++ b/modules/Bio/EnsEMBL/Production/Pipeline/PipeConfig/LoadAppris_conf.pm @@ -32,7 +32,7 @@ use warnings; use base ('Bio::EnsEMBL::Production::Pipeline::PipeConfig::Base_conf'); -use Bio::EnsEMBL::Hive::Version 2.5; +use Bio::EnsEMBL::Hive::Version 2.7; use Bio::EnsEMBL::Hive::PipeConfig::HiveGeneric_conf; sub default_options { diff --git a/modules/Bio/EnsEMBL/Production/Pipeline/PipeConfig/LoadRefget_conf.pm b/modules/Bio/EnsEMBL/Production/Pipeline/PipeConfig/LoadRefget_conf.pm index faab42885..c10601583 100644 --- a/modules/Bio/EnsEMBL/Production/Pipeline/PipeConfig/LoadRefget_conf.pm +++ b/modules/Bio/EnsEMBL/Production/Pipeline/PipeConfig/LoadRefget_conf.pm @@ -20,7 +20,7 @@ use warnings; use base ('Bio::EnsEMBL::Production::Pipeline::PipeConfig::Base_conf'); use Bio::EnsEMBL::Hive::PipeConfig::HiveGeneric_conf; -use 
Bio::EnsEMBL::Hive::Version 2.5; +use Bio::EnsEMBL::Hive::Version 2.7; use File::Spec; sub default_options { diff --git a/modules/Bio/EnsEMBL/Production/Pipeline/PipeConfig/LoadTSL_conf.pm b/modules/Bio/EnsEMBL/Production/Pipeline/PipeConfig/LoadTSL_conf.pm index 670594711..184d5a82f 100644 --- a/modules/Bio/EnsEMBL/Production/Pipeline/PipeConfig/LoadTSL_conf.pm +++ b/modules/Bio/EnsEMBL/Production/Pipeline/PipeConfig/LoadTSL_conf.pm @@ -32,7 +32,7 @@ use warnings; use base ('Bio::EnsEMBL::Production::Pipeline::PipeConfig::Base_conf'); -use Bio::EnsEMBL::Hive::Version 2.5; +use Bio::EnsEMBL::Hive::Version 2.7; use Bio::EnsEMBL::Hive::PipeConfig::HiveGeneric_conf; sub default_options { diff --git a/modules/Bio/EnsEMBL/Production/Pipeline/PipeConfig/MVP_XrefProcess_conf.pm b/modules/Bio/EnsEMBL/Production/Pipeline/PipeConfig/MVP_XrefProcess_conf.pm index 7c800054b..9826b1478 100644 --- a/modules/Bio/EnsEMBL/Production/Pipeline/PipeConfig/MVP_XrefProcess_conf.pm +++ b/modules/Bio/EnsEMBL/Production/Pipeline/PipeConfig/MVP_XrefProcess_conf.pm @@ -24,7 +24,7 @@ use warnings; use base ('Bio::EnsEMBL::Production::Pipeline::PipeConfig::Base_conf'); -use Bio::EnsEMBL::Hive::Version 2.5; +use Bio::EnsEMBL::Hive::Version 2.7; use Bio::EnsEMBL::Hive::PipeConfig::HiveGeneric_conf; sub default_options { diff --git a/modules/Bio/EnsEMBL/Production/Pipeline/PipeConfig/OLSLoad_conf.pm b/modules/Bio/EnsEMBL/Production/Pipeline/PipeConfig/OLSLoad_conf.pm index 0b83cf992..66549db1e 100644 --- a/modules/Bio/EnsEMBL/Production/Pipeline/PipeConfig/OLSLoad_conf.pm +++ b/modules/Bio/EnsEMBL/Production/Pipeline/PipeConfig/OLSLoad_conf.pm @@ -31,7 +31,7 @@ use warnings FATAL => 'all'; use base ('Bio::EnsEMBL::Production::Pipeline::PipeConfig::Base_conf'); use Bio::EnsEMBL::Hive::PipeConfig::HiveGeneric_conf; -use Bio::EnsEMBL::Hive::Version 2.5; +use Bio::EnsEMBL::Hive::Version 2.7; sub default_options { my ($self) = @_; diff --git a/modules/Bio/EnsEMBL/Production/Pipeline/PipeConfig/ProductionDBSync_conf.pm b/modules/Bio/EnsEMBL/Production/Pipeline/PipeConfig/ProductionDBSync_conf.pm index 3b58fe283..002f8f271 100644 --- a/modules/Bio/EnsEMBL/Production/Pipeline/PipeConfig/ProductionDBSync_conf.pm +++ b/modules/Bio/EnsEMBL/Production/Pipeline/PipeConfig/ProductionDBSync_conf.pm @@ -32,7 +32,7 @@ use warnings; use base ('Bio::EnsEMBL::Production::Pipeline::PipeConfig::Base_conf'); use Bio::EnsEMBL::Hive::PipeConfig::HiveGeneric_conf; -use Bio::EnsEMBL::Hive::Version 2.5; +use Bio::EnsEMBL::Hive::Version 2.7; use File::Spec::Functions qw(catdir); sub default_options { diff --git a/modules/Bio/EnsEMBL/Production/Pipeline/PipeConfig/ProteinFeatures_conf.pm b/modules/Bio/EnsEMBL/Production/Pipeline/PipeConfig/ProteinFeatures_conf.pm index a140af5a7..55b1cea2e 100755 --- a/modules/Bio/EnsEMBL/Production/Pipeline/PipeConfig/ProteinFeatures_conf.pm +++ b/modules/Bio/EnsEMBL/Production/Pipeline/PipeConfig/ProteinFeatures_conf.pm @@ -25,7 +25,7 @@ use warnings; use base ('Bio::EnsEMBL::Production::Pipeline::PipeConfig::Base_conf'); use Bio::EnsEMBL::Hive::PipeConfig::HiveGeneric_conf; -use Bio::EnsEMBL::Hive::Version 2.5; +use Bio::EnsEMBL::Hive::Version 2.7; use File::Spec::Functions qw(catdir); @@ -75,7 +75,7 @@ sub default_options { interpro_file => 'names.dat', interpro2go_file => 'interpro2go', - uniparc_file => 'upidump.lis', + uniparc_file => 'upidump.lis.gz', mapping_file => 'idmapping_selected.tab.gz', # Files are retrieved and stored locally with the same name. 
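Note on the hunk above: uniparc_file is now fetched gzipped (upidump.lis.gz). The LoadUniParc runnable itself is not part of this diff, but a minimal sketch of streaming a gzipped dump line by line with IO::Zlib (the same module CreateUniparcDB.pm uses, and already a cpanfile requirement) could look like this; the local path and per-record handling are illustrative assumptions, not the actual implementation:

    use strict;
    use warnings;
    use IO::Zlib;

    my $uniparc_file = 'upidump.lis.gz';   # illustrative local path
    my $fh = IO::Zlib->new;
    $fh->open($uniparc_file, 'rb')
        or die "Opening $uniparc_file with IO::Zlib failed: $!";
    my $n = 0;
    while (my $line = <$fh>) {
        chomp $line;
        $n++;   # real code would parse and load the record here
    }
    $fh->close;
    print "Read $n UniParc records\n";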
@@ -228,11 +228,35 @@ sub default_options { ipscan_lookup => 0, }, { - logic_name => 'seg', - db => 'Seg', + db => 'Phobius', + ipscan_lookup => 1, + ipscan_name => 'Phobius', + ipscan_xml => 'PHOBIUS', + logic_name => 'phobius', + program => 'InterProScan', }, - ], - + { + db => 'SignalP_GRAM_POSITIVE', + ipscan_lookup => 1, + ipscan_name => 'SignalP_GRAM_POSITIVE', + ipscan_xml => 'SIGNALP_GRAM_POSITIVE', + logic_name => 'signalp_gram_positive', + program => 'InterProScan', + }, + { + db => 'SignalP_GRAM_NEGATIVE', + ipscan_lookup => 1, + ipscan_name => 'SignalP_GRAM_NEGATIVE', + ipscan_xml => 'SIGNALP_GRAM_NEGATIVE', + logic_name => 'signalp_gram_negative', + program => 'InterProScan', + }, + # seg replaces low-complexity regions in protein sequences with X characters (https://rothlab.ucdavis.edu/genhelp/seg.html) + { + logic_name => 'seg', + db => 'Seg', + }, + ], xref_analyses => [ { @@ -336,17 +360,19 @@ sub pipeline_analyses { -logic_name => 'InterProScanVersionCheck', -module => 'Bio::EnsEMBL::Production::Pipeline::ProteinFeatures::InterProScanVersionCheck', -max_retry_count => 0, -# -input_ids => [ {} ], - -parameters => { - interproscan_path => $self->o('interproscan_path'), - interproscan_version => $self->o('interproscan_version'), - local_computation => $self->o('local_computation'), - }, - -flow_into => { - '3->A' => [ 'FetchFiles' ], - 'A->3' => [ 'AnnotateProteinFeatures' ], - }, - }, +# -input_ids => [ {} ], + -parameters => { + interproscan_path => $self->o('interproscan_path'), + interproscan_version => $self->o('interproscan_version'), + local_computation => $self->o('local_computation'), + }, + -flow_into => { + '3->A' => ['FetchFiles'], + 'A->3' => ['AnnotateProteinFeatures'], + }, + -rc_name => '2GB', + + }, { -logic_name => 'FetchFiles', @@ -421,14 +447,16 @@ sub pipeline_analyses { -rc_name => 'dm', }, - { - -logic_name => 'LoadUniParc', - -module => 'Bio::EnsEMBL::Production::Pipeline::ProteinFeatures::LoadUniParc', - -max_retry_count => 1, - -parameters => { - uniparc_file_local => $self->o('uniparc_file_local'), - }, - }, + { + -logic_name => 'LoadUniParc', + -module => 'Bio::EnsEMBL::Production::Pipeline::ProteinFeatures::LoadUniParc', + -max_retry_count => 1, + -parameters => { + uniparc_file_local => $self->o('uniparc_file_local'), + }, + -rc_name => '2GB_W', + + }, { -logic_name => 'LoadUniProt', @@ -485,20 +513,23 @@ sub pipeline_analyses { ], output_file => catdir('#pipeline_dir#', '#dbname#', 'pre_pipeline_bkp.sql.gz'), }, - -flow_into => [ 'AnalysisConfiguration' ], + -rc_name => '1GB', + -flow_into => [ 'AnalysisConfiguration' ], }, - { - -logic_name => 'AnalysisConfiguration', - -module => 'Bio::EnsEMBL::Production::Pipeline::ProteinFeatures::AnalysisConfiguration', - -max_retry_count => 0, - -parameters => { - protein_feature_analyses => $self->o('protein_feature_analyses'), - check_interpro_db_version => $self->o('check_interpro_db_version'), - run_seg => $self->o('run_seg'), - xref_analyses => $self->o('xref_analyses'), - }, - -flow_into => { + { + -logic_name => 'AnalysisConfiguration', + -module => 'Bio::EnsEMBL::Production::Pipeline::ProteinFeatures::AnalysisConfiguration', + -max_retry_count => 0, + -parameters => { + protein_feature_analyses => $self->o('protein_feature_analyses'), + check_interpro_db_version => $self->o('check_interpro_db_version'), + run_seg => $self->o('run_seg'), + xref_analyses => $self->o('xref_analyses'), + }, + -rc_name => '2GB', + + -flow_into => { '2->A' => [ 'AnalysisSetup' ], 'A->3' => [ 'RemoveOrphans' ], } @@ -535,7 
+566,8 @@ sub pipeline_analyses { 'WHERE ox.object_xref_id IS NULL', ] }, - -flow_into => [ 'DeleteInterPro' ] + -rc_name => 'default_D', + -flow_into => [ 'DeleteInterPro' ] }, { @@ -573,7 +605,8 @@ sub pipeline_analyses { -max_retry_count => 1, -analysis_capacity => 20, -parameters => {}, - -flow_into => { + -rc_name => '1GB', + -flow_into => { '2' => [ 'DumpProteome' ], } }, @@ -588,7 +621,7 @@ sub pipeline_analyses { header_style => 'dbID', overwrite => 1, }, - -rc_name => '4GB', + -rc_name => '4GB_W', -flow_into => { '-1' => [ 'DumpProteome_HighMem' ], '1' => WHEN('#run_seg#' => @@ -631,7 +664,8 @@ sub pipeline_analyses { max_files_per_directory => $self->o('max_files_per_directory'), max_dirs_per_directory => $self->o('max_dirs_per_directory'), }, - -flow_into => { + -rc_name => '1GB', + -flow_into => { '2' => [ 'RunSeg' ], }, }, @@ -646,7 +680,8 @@ sub pipeline_analyses { { cmd => $self->o('seg_exe') . ' #split_file# ' . $self->o('seg_params') . ' > #split_file#.seg.txt', }, - -flow_into => [ 'StoreSegFeatures' ], + -rc_name => '1GB', + -flow_into => [ 'StoreSegFeatures' ], }, { @@ -673,7 +708,8 @@ sub pipeline_analyses { uniparc_logic_name => $self->o('uniparc_logic_name'), uniprot_logic_name => $self->o('uniprot_logic_name'), }, - -flow_into => { + -rc_name => '1GB_D', + -flow_into => { '3' => [ 'SplitChecksumFile' ], '4' => [ 'SplitNoChecksumFile' ], }, @@ -764,7 +800,7 @@ sub pipeline_analyses { interproscan_applications => '#interproscan_nolookup_applications#', run_interproscan => $self->o('run_interproscan'), }, - -rc_name => '4GB_8CPU', + -rc_name => '16GB_8CPU', -flow_into => { '3' => [ 'StoreProteinFeatures' ], '-1' => [ 'InterProScanNoLookup_HighMem' ], @@ -801,7 +837,7 @@ sub pipeline_analyses { interproscan_applications => '#interproscan_local_applications#', run_interproscan => $self->o('run_interproscan'), }, - -rc_name => '4GB_8CPU', + -rc_name => '16GB_8CPU', -flow_into => { '3' => [ 'StoreProteinFeatures' ], '0' => [ 'InterProScanLocal_HighMem' ], @@ -835,7 +871,8 @@ sub pipeline_analyses { -parameters => { analyses => $self->o('protein_feature_analyses') }, - -flow_into => { + -rc_name => '1GB_D', + -flow_into => { '-1' => [ 'StoreProteinFeatures_HighMem' ], }, }, @@ -861,7 +898,8 @@ sub pipeline_analyses { interpro2go_file => $self->o('interpro2go_file_local'), logic_name => $self->o('interpro2go_logic_name') }, - -flow_into => [ 'StoreInterProXrefs' ], + -rc_name => '1GB', + -flow_into => [ 'StoreInterProXrefs' ], }, { @@ -892,7 +930,8 @@ sub pipeline_analyses { history_file => $self->o('history_file'), failures_fatal => 1, }, - -flow_into => WHEN('#email_report#' => [ 'EmailReport' ]), + -rc_name => '2GB_D', + -flow_into => WHEN('#email_report#' => [ 'EmailReport' ]), }, { @@ -906,14 +945,23 @@ sub pipeline_analyses { }, }, - { - -logic_name => 'TidyScratch', - -module => 'Bio::EnsEMBL::Hive::RunnableDB::SystemCmd', - -max_retry_count => 1, - -parameters => { - cmd => 'rm -rf #scratch_dir#', - }, + { + -logic_name => 'TidyScratch', + -module => 'Bio::EnsEMBL::Hive::RunnableDB::SystemCmd', + -max_retry_count => 1, + -parameters => { + cmd => 'rm -rf #scratch_dir#', + }, + -flow_into => 'CleanTables', + }, + + { + -logic_name => 'CleanTables', + -module => 'Bio::EnsEMBL::Hive::RunnableDB::SqlCmd', + -parameters => { + sql => [ 'DROP TABLE uniparc', 'DROP TABLE uniprot' ], }, + }, ]; } @@ -921,12 +969,13 @@ sub pipeline_analyses { sub resource_classes { my ($self) = @_; - return { - %{$self->SUPER::resource_classes}, - '4GB_8CPU' => { 'LSF' => '-q ' . 
$self->o('production_queue') . ' -n 8 -M 4000 -R "rusage[mem=4000]"' }, - '16GB_8CPU' => { 'LSF' => '-q ' . $self->o('production_queue') . ' -n 8 -M 16000 -R "rusage[mem=16000]"' }, - '32GB_8CPU' => { 'LSF' => '-q ' . $self->o('production_queue') . ' -n 8 -M 32000 -R "rusage[mem=32000]"' }, - } + return { + %{$self->SUPER::resource_classes}, + '16GB_8CPU' => { 'LSF' => '-q ' . $self->o('production_queue') . ' -n 8 -M 16000 -R "rusage[mem=16000]"' , + 'SLURM' => ' --partition=standard --time=1-00:00:00 --mem=16000m -n 8 -N 1'}, + '32GB_8CPU' => { 'LSF' => '-q ' . $self->o('production_queue') . ' -n 8 -M 32000 -R "rusage[mem=32000]"' , + 'SLURM' => ' --partition=standard --time=1-00:00:00 --mem=32000m -n 8 -N 1'}, + } } 1; diff --git a/modules/Bio/EnsEMBL/Production/Pipeline/PipeConfig/RNAGeneXref_conf.pm b/modules/Bio/EnsEMBL/Production/Pipeline/PipeConfig/RNAGeneXref_conf.pm index 40c6c9b78..6d8fa3fc8 100644 --- a/modules/Bio/EnsEMBL/Production/Pipeline/PipeConfig/RNAGeneXref_conf.pm +++ b/modules/Bio/EnsEMBL/Production/Pipeline/PipeConfig/RNAGeneXref_conf.pm @@ -25,7 +25,7 @@ use warnings; use base ('Bio::EnsEMBL::Production::Pipeline::PipeConfig::Base_conf'); use Bio::EnsEMBL::Hive::PipeConfig::HiveGeneric_conf; -use Bio::EnsEMBL::Hive::Version 2.5; +use Bio::EnsEMBL::Hive::Version 2.7; use File::Spec::Functions qw(catdir); diff --git a/modules/Bio/EnsEMBL/Production/Pipeline/PipeConfig/SampleDataCopy_conf.pm b/modules/Bio/EnsEMBL/Production/Pipeline/PipeConfig/SampleDataCopy_conf.pm index d02de7472..82778ac57 100644 --- a/modules/Bio/EnsEMBL/Production/Pipeline/PipeConfig/SampleDataCopy_conf.pm +++ b/modules/Bio/EnsEMBL/Production/Pipeline/PipeConfig/SampleDataCopy_conf.pm @@ -41,7 +41,7 @@ use warnings; use base ('Bio::EnsEMBL::Production::Pipeline::PipeConfig::Base_conf'); use Bio::EnsEMBL::Hive::PipeConfig::HiveGeneric_conf; -use Bio::EnsEMBL::Hive::Version 2.5; +use Bio::EnsEMBL::Hive::Version 2.7; sub default_options { my ($self) = @_; diff --git a/modules/Bio/EnsEMBL/Production/Pipeline/PipeConfig/SampleData_conf.pm b/modules/Bio/EnsEMBL/Production/Pipeline/PipeConfig/SampleData_conf.pm index 79a860494..110f532dd 100644 --- a/modules/Bio/EnsEMBL/Production/Pipeline/PipeConfig/SampleData_conf.pm +++ b/modules/Bio/EnsEMBL/Production/Pipeline/PipeConfig/SampleData_conf.pm @@ -43,7 +43,7 @@ use warnings; use base ('Bio::EnsEMBL::Production::Pipeline::PipeConfig::Base_conf'); use Bio::EnsEMBL::Hive::PipeConfig::HiveGeneric_conf; -use Bio::EnsEMBL::Hive::Version 2.5; +use Bio::EnsEMBL::Hive::Version 2.7; sub default_options { my ($self) = @_; diff --git a/modules/Bio/EnsEMBL/Production/Pipeline/PipeConfig/SearchDumps_conf.pm b/modules/Bio/EnsEMBL/Production/Pipeline/PipeConfig/SearchDumps_conf.pm index 6b16ce2e4..adb3bcbbd 100644 --- a/modules/Bio/EnsEMBL/Production/Pipeline/PipeConfig/SearchDumps_conf.pm +++ b/modules/Bio/EnsEMBL/Production/Pipeline/PipeConfig/SearchDumps_conf.pm @@ -37,7 +37,7 @@ use warnings; use base ('Bio::EnsEMBL::Production::Pipeline::PipeConfig::Base_conf'); use Bio::EnsEMBL::Hive::PipeConfig::HiveGeneric_conf; -use Bio::EnsEMBL::Hive::Version 2.5; +use Bio::EnsEMBL::Hive::Version 2.7; sub default_options { my ($self) = @_; diff --git a/modules/Bio/EnsEMBL/Production/Pipeline/PipeConfig/StableIDs_conf.pm b/modules/Bio/EnsEMBL/Production/Pipeline/PipeConfig/StableIDs_conf.pm index 4df737ac6..dc30908ef 100755 --- a/modules/Bio/EnsEMBL/Production/Pipeline/PipeConfig/StableIDs_conf.pm +++ b/modules/Bio/EnsEMBL/Production/Pipeline/PipeConfig/StableIDs_conf.pm @@ 
-38,7 +38,7 @@ use warnings; use base ('Bio::EnsEMBL::Production::Pipeline::PipeConfig::Base_conf'); use Bio::EnsEMBL::Hive::PipeConfig::HiveGeneric_conf; -use Bio::EnsEMBL::Hive::Version 2.5; +use Bio::EnsEMBL::Hive::Version 2.7; sub default_options { my ($self) = @_; diff --git a/modules/Bio/EnsEMBL/Production/Pipeline/PipeConfig/TaxonomyInfoCore_conf.pm b/modules/Bio/EnsEMBL/Production/Pipeline/PipeConfig/TaxonomyInfoCore_conf.pm index cb010d7ef..ec6141542 100644 --- a/modules/Bio/EnsEMBL/Production/Pipeline/PipeConfig/TaxonomyInfoCore_conf.pm +++ b/modules/Bio/EnsEMBL/Production/Pipeline/PipeConfig/TaxonomyInfoCore_conf.pm @@ -25,7 +25,7 @@ use warnings; use base ('Bio::EnsEMBL::Production::Pipeline::PipeConfig::Base_conf'); use Bio::EnsEMBL::Hive::PipeConfig::HiveGeneric_conf; -use Bio::EnsEMBL::Hive::Version 2.5; +use Bio::EnsEMBL::Hive::Version 2.7; use File::Spec::Functions qw(catdir); sub default_options { diff --git a/modules/Bio/EnsEMBL/Production/Pipeline/PipeConfig/TranscriptomeDomains_conf.pm b/modules/Bio/EnsEMBL/Production/Pipeline/PipeConfig/TranscriptomeDomains_conf.pm index b9426d30c..fdad0bde8 100755 --- a/modules/Bio/EnsEMBL/Production/Pipeline/PipeConfig/TranscriptomeDomains_conf.pm +++ b/modules/Bio/EnsEMBL/Production/Pipeline/PipeConfig/TranscriptomeDomains_conf.pm @@ -25,7 +25,7 @@ use warnings; use base ('Bio::EnsEMBL::Production::Pipeline::PipeConfig::ProteinFeatures_conf'); use Bio::EnsEMBL::Hive::PipeConfig::HiveGeneric_conf; -use Bio::EnsEMBL::Hive::Version 2.5; +use Bio::EnsEMBL::Hive::Version 2.7; sub default_options { my ($self) = @_; diff --git a/modules/Bio/EnsEMBL/Production/Pipeline/PipeConfig/UpdatePackedStatus_conf.pm b/modules/Bio/EnsEMBL/Production/Pipeline/PipeConfig/UpdatePackedStatus_conf.pm index 8b02ae00d..b0a6c1460 100644 --- a/modules/Bio/EnsEMBL/Production/Pipeline/PipeConfig/UpdatePackedStatus_conf.pm +++ b/modules/Bio/EnsEMBL/Production/Pipeline/PipeConfig/UpdatePackedStatus_conf.pm @@ -26,7 +26,7 @@ use warnings; use base ('Bio::EnsEMBL::Hive::PipeConfig::HiveGeneric_conf'); use Bio::EnsEMBL::Hive::PipeConfig::HiveGeneric_conf; -use Bio::EnsEMBL::Hive::Version 2.5; +use Bio::EnsEMBL::Hive::Version 2.7; sub default_options { my ($self) = @_; diff --git a/modules/Bio/EnsEMBL/Production/Pipeline/PipeConfig/VariationStatistics_conf.pm b/modules/Bio/EnsEMBL/Production/Pipeline/PipeConfig/VariationStatistics_conf.pm index 0cb424cbf..a709f5b44 100644 --- a/modules/Bio/EnsEMBL/Production/Pipeline/PipeConfig/VariationStatistics_conf.pm +++ b/modules/Bio/EnsEMBL/Production/Pipeline/PipeConfig/VariationStatistics_conf.pm @@ -37,7 +37,7 @@ use warnings; use base ('Bio::EnsEMBL::Production::Pipeline::PipeConfig::CoreStatistics_conf'); -use Bio::EnsEMBL::Hive::Version 2.5; +use Bio::EnsEMBL::Hive::Version 2.7; sub default_options { my ($self) = @_; diff --git a/modules/Bio/EnsEMBL/Production/Pipeline/PipeConfig/XrefDownload_conf.pm b/modules/Bio/EnsEMBL/Production/Pipeline/PipeConfig/XrefDownload_conf.pm index 4c17d22d3..6bbe3fa04 100644 --- a/modules/Bio/EnsEMBL/Production/Pipeline/PipeConfig/XrefDownload_conf.pm +++ b/modules/Bio/EnsEMBL/Production/Pipeline/PipeConfig/XrefDownload_conf.pm @@ -24,7 +24,7 @@ use warnings; use base ('Bio::EnsEMBL::Production::Pipeline::PipeConfig::Base_conf'); -use Bio::EnsEMBL::Hive::Version 2.5; +use Bio::EnsEMBL::Hive::Version 2.7; use Bio::EnsEMBL::Hive::PipeConfig::HiveGeneric_conf; sub default_options { @@ -76,7 +76,7 @@ sub pipeline_analyses { '2->A' => 'download_source', 'A->1' => 'schedule_cleanup' }, - 
-rc_name => 'small' + -rc_name => 'default' }, { -logic_name => 'download_source', @@ -85,7 +85,30 @@ -parameters => { base_path => $self->o('base_path') }, - -rc_name => 'dm', + -rc_name => 'dm_D', + -max_retry_count => 3, + -flow_into => { '-1' => 'download_source_32'} + }, + { + -logic_name => 'download_source_32', + -module => 'Bio::EnsEMBL::Production::Pipeline::Xrefs::DownloadSource', + -comment => 'Downloads the source files and stores them in -base_path.', + -parameters => { + base_path => $self->o('base_path') + }, + -rc_name => 'dm32_D', + -max_retry_count => 3, + -flow_into => { '-1' => 'download_source_MAX'} + }, + #THIS STEP IS THE RESULT OF A BUG AND SHOULD BE REMOVED AS SOON AS THE PIPELINE IS FIXED + { + -logic_name => 'download_source_MAX', + -module => 'Bio::EnsEMBL::Production::Pipeline::Xrefs::DownloadSource', + -comment => 'Downloads the source files and stores them in -base_path.', + -parameters => { + base_path => $self->o('base_path') + }, + -rc_name => 'dmMAX_D', -max_retry_count => 3 }, { @@ -102,7 +125,7 @@ '4->A' => 'cleanup_uniprot', 'A->1' => 'schedule_pre_parse' }, - -rc_name => 'small' + -rc_name => 'default' }, { -logic_name => 'checksum', @@ -112,7 +135,7 @@ base_path => $self->o('base_path'), skip_download => $self->o('skip_download') }, - -rc_name => 'normal' + -rc_name => '100M_W' }, { -logic_name => 'cleanup_refseq_dna', @@ -124,7 +147,7 @@ skip_download => $self->o('skip_download'), clean_dir => $self->o('clean_dir') }, - -rc_name => 'small' + -rc_name => '100M_D' }, { -logic_name => 'cleanup_refseq_peptide', @@ -136,7 +159,7 @@ skip_download => $self->o('skip_download'), clean_dir => $self->o('clean_dir') }, - -rc_name => 'small' + -rc_name => 'default' }, { -logic_name => 'cleanup_uniprot', @@ -148,7 +171,7 @@ skip_download => $self->o('skip_download'), clean_dir => $self->o('clean_dir') }, - -rc_name => 'small' + -rc_name => '200M_D' }, { -logic_name => 'schedule_pre_parse', @@ -167,13 +190,23 @@ '4' => 'pre_parse_source_tertiary', '-1' => 'notify_by_email' }, - -rc_name => 'small' + -rc_name => 'default' }, + { -logic_name => 'pre_parse_source', -module => 'Bio::EnsEMBL::Production::Pipeline::Xrefs::PreParse', -comment => 'Store data for faster species parsing', - -rc_name => '2GB', + -rc_name => '2GB_D', + -hive_capacity => 100, + -can_be_empty => 1, + -flow_into => {'-1' => 'pre_parse_source_long_HM'} + }, + { + -logic_name => 'pre_parse_source_long_HM', + -module => 'Bio::EnsEMBL::Production::Pipeline::Xrefs::PreParse', + -comment => 'Store data for faster species parsing', + -rc_name => '4GB_W', + -hive_capacity => 100, + -can_be_empty => 1, }, @@ -181,7 +214,7 @@ -logic_name => 'pre_parse_source_dependent', -module => 'Bio::EnsEMBL::Production::Pipeline::Xrefs::PreParse', -comment => 'Store data for faster species parsing', - -rc_name => '2GB', + -rc_name => '16GB_D', -hive_capacity => 100, -can_be_empty => 1, -wait_for => 'pre_parse_source' @@ -190,7 +223,7 @@ -logic_name => 'pre_parse_source_tertiary', -module => 'Bio::EnsEMBL::Production::Pipeline::Xrefs::PreParse', -comment => 'Store data for faster species parsing', - -rc_name => '2GB', + -rc_name => '2GB_D', -hive_capacity => 100, -can_be_empty => 1, -wait_for => 'pre_parse_source_dependent', @@ -208,21 +241,11 @@ skip_preparse => 
$self->o('skip_preparse') }, -wait_for => 'pre_parse_source_tertiary', - -rc_name => 'small' + -rc_name => 'default' } ]; } -sub resource_classes { - my ($self) = @_; - - return { - %{$self->SUPER::resource_classes}, - 'small' => { 'LSF' => '-q production -M 200 -R "rusage[mem=200]"' }, # Change 'production' to 'production-rh74' if running on noah - 'normal' => { 'LSF' => '-q production -M 1000 -R "rusage[mem=1000]"' } - }; -} - sub pipeline_wide_parameters { my ($self) = @_; diff --git a/modules/Bio/EnsEMBL/Production/Pipeline/PipeConfig/XrefProcess_conf.pm b/modules/Bio/EnsEMBL/Production/Pipeline/PipeConfig/XrefProcess_conf.pm index c737c2a77..d478fff24 100644 --- a/modules/Bio/EnsEMBL/Production/Pipeline/PipeConfig/XrefProcess_conf.pm +++ b/modules/Bio/EnsEMBL/Production/Pipeline/PipeConfig/XrefProcess_conf.pm @@ -24,7 +24,7 @@ use warnings; use base ('Bio::EnsEMBL::Production::Pipeline::PipeConfig::Base_conf'); -use Bio::EnsEMBL::Hive::Version 2.5; +use Bio::EnsEMBL::Hive::Version 2.7; use Bio::EnsEMBL::Hive::PipeConfig::HiveGeneric_conf; sub default_options { @@ -76,7 +76,7 @@ sub pipeline_analyses { '1->A' => 'schedule_species', 'A->1' => 'EmailAdvisoryXrefReport' }, - -rc_name => 'small', + -rc_name => 'default', }, { -logic_name => 'schedule_species', @@ -93,7 +93,7 @@ sub pipeline_analyses { '2->A' => 'schedule_source', 'A->2' => 'schedule_dependent_source' }, - -rc_name => 'small', + -rc_name => 'default', }, { -logic_name => 'schedule_source', @@ -112,7 +112,7 @@ sub pipeline_analyses { xref_pass => $self->o('xref_pass'), }, -flow_into => { '2' => 'parse_source' }, - -rc_name => 'small', + -rc_name => '1GB_D', -analysis_capacity => 10, }, { @@ -133,7 +133,7 @@ sub pipeline_analyses { '2->A' => 'parse_source', 'A->1' => 'schedule_tertiary_source', }, - -rc_name => 'small', + -rc_name => '4GB_D', }, { -logic_name => 'schedule_tertiary_source', @@ -153,12 +153,12 @@ sub pipeline_analyses { '2->A' => 'parse_source', 'A->1' => 'dump_ensembl', }, - -rc_name => 'small', + -rc_name => 'default', }, { -logic_name => 'parse_source', -module => 'Bio::EnsEMBL::Production::Pipeline::Xrefs::ParseSource', - -rc_name => 'large', + -rc_name => '16GB_D', -hive_capacity => 300, -analysis_capacity => 50, -batch_size => 30, @@ -173,11 +173,12 @@ sub pipeline_analyses { base_path => $self->o('base_path'), release => $self->o('release') }, + -max_retry_count => 0, -flow_into => { '2->A' => 'dump_xref', 'A->1' => 'schedule_mapping' }, - -rc_name => 'mem', + -rc_name => '16GB_D', }, { -logic_name => 'dump_xref', @@ -187,8 +188,9 @@ sub pipeline_analyses { release => $self->o('release'), config_file => $self->o('config_file') }, + -max_retry_count => 0, -flow_into => { 2 => 'align_factory' }, - -rc_name => 'normal', + -rc_name => '1GB', }, { -logic_name => 'align_factory', @@ -197,7 +199,7 @@ sub pipeline_analyses { base_path => $self->o('base_path'), release => $self->o('release')}, -flow_into => { 2 => 'align' }, - -rc_name => 'small', + -rc_name => 'default', }, { -logic_name => 'align', @@ -205,7 +207,7 @@ sub pipeline_analyses { -parameters => { base_path => $self->o('base_path') }, - -rc_name => 'large', + -rc_name => '16GB_D', -hive_capacity => 300, -analysis_capacity => 300, -batch_size => 5, @@ -220,9 +222,9 @@ sub pipeline_analyses { }, -flow_into => { '2->A' => ['direct_xrefs', 'rnacentral_mapping'], - 'A->1' => 'mapping' + 'A->1' => 'object_xref_check' }, - -rc_name => 'small', + -rc_name => '1GB', }, { -logic_name => 'direct_xrefs', @@ -232,7 +234,7 @@ sub pipeline_analyses { release => 
$self->o('release') }, -flow_into => { 1 => 'process_alignment' }, - -rc_name => 'normal', + -rc_name => '1GB_D', -analysis_capacity => 30 }, { @@ -242,7 +244,7 @@ sub pipeline_analyses { base_path => $self->o('base_path'), release => $self->o('release') }, - -rc_name => 'normal', + -rc_name => '1GB_D', -analysis_capacity => 30 }, { @@ -253,7 +255,7 @@ sub pipeline_analyses { release => $self->o('release') }, -flow_into => { 1 => 'uniparc_mapping' }, - -rc_name => 'normal', + -rc_name => 'default', -hive_capacity => 300, -analysis_capacity => 30 }, @@ -265,7 +267,7 @@ sub pipeline_analyses { release => $self->o('release') }, -flow_into => { 1 => 'coordinate_mapping' }, - -rc_name => 'normal', + -rc_name => '1GB', -hive_capacity => 300, -analysis_capacity => 30 }, @@ -276,9 +278,21 @@ sub pipeline_analyses { base_path => $self->o('base_path'), release => $self->o('release') }, - -rc_name => 'mem', + -rc_name => '16GB', -analysis_capacity => 30 }, + { + -logic_name => 'object_xref_check', + -module => 'Bio::EnsEMBL::Hive::RunnableDB::SqlHealthcheck', + -parameters => { + db_conn => '#xref_url#', + description => 'Check that the object_xref table has rows', + query => 'SELECT object_xref_id FROM object_xref', + expected_size => '> 0' + }, + -flow_into => { 1 => 'mapping' }, + -rc_name => 'default', + }, { -logic_name => 'mapping', -module => 'Bio::EnsEMBL::Production::Pipeline::Xrefs::Mapping', @@ -290,25 +304,26 @@ sub pipeline_analyses { '1->A' => 'RunXrefCriticalDatacheck', 'A->1' => 'RunXrefAdvisoryDatacheck' }, - -rc_name => 'mem', + -rc_name => '16GB_D', -analysis_capacity => 30, }, { - -logic_name => 'RunXrefCriticalDatacheck', - -module => 'Bio::EnsEMBL::DataCheck::Pipeline::RunDataChecks', - -max_retry_count => 1, - -analysis_capacity => 10, - -batch_size => 10, - -parameters => { - datacheck_names => ['ForeignKeys'], - datacheck_groups => ['xref_mapping'], - datacheck_types => ['critical'], - registry_file => $self->o('registry'), - config_file => $self->o('dc_config_file'), - history_file => $self->o('history_file'), - old_server_uri => $self->o('old_server_uri'), - failures_fatal => 1, - }, + -logic_name => 'RunXrefCriticalDatacheck', + -module => 'Bio::EnsEMBL::DataCheck::Pipeline::RunDataChecks', + -max_retry_count => 1, + -analysis_capacity => 10, + -batch_size => 10, + -parameters => { + datacheck_names => [ 'ForeignKeys' ], + datacheck_groups => [ 'xref_mapping' ], + datacheck_types => [ 'critical' ], + registry_file => $self->o('registry'), + config_file => $self->o('dc_config_file'), + history_file => $self->o('history_file'), + old_server_uri => $self->o('old_server_uri'), + failures_fatal => 1, + }, + -rc_name => '1GB', }, { -logic_name => 'RunXrefAdvisoryDatacheck', @@ -325,12 +340,14 @@ sub pipeline_analyses { old_server_uri => $self->o('old_server_uri'), failures_fatal => 0, }, - -flow_into => { 4 => 'AdvisoryXrefReport' } + -flow_into => { 4 => 'AdvisoryXrefReport' }, + -rc_name => '1GB', + }, { -logic_name => 'AdvisoryXrefReport', -module => 'Bio::EnsEMBL::Production::Pipeline::Xrefs::AdvisoryXrefReport', - -rc_name => 'small' + -rc_name => 'default' }, { -logic_name => 'EmailAdvisoryXrefReport', @@ -340,7 +357,7 @@ sub pipeline_analyses { pipeline_name => $self->o('pipeline_name'), base_path => $self->o('base_path') }, - -rc_name => 'small', + -rc_name => 'default', -flow_into => { 1 => 'notify_by_email' } }, { @@ -350,22 +367,11 @@ sub pipeline_analyses { email => $self->o('email'), pipeline_name => $self->o('pipeline_name') }, - -rc_name => 'small' + -rc_name => 
'default' } ]; } -sub resource_classes { - my ($self) = @_; - - return { - %{$self->SUPER::resource_classes}, - 'small' => { 'LSF' => '-q production -M 200 -R "rusage[mem=200]"' }, - 'normal' => { 'LSF' => '-q production -M 500 -R "rusage[mem=500]"' }, - 'mem' => { 'LSF' => '-q production -M 3000 -R "rusage[mem=3000]"' }, - 'large' => { 'LSF' => '-q production -M 10000 -R "rusage[mem=10000]"' }, - } -} sub pipeline_wide_parameters { my ($self) = @_; diff --git a/modules/Bio/EnsEMBL/Production/Pipeline/PipeConfig/Xref_update_conf.pm b/modules/Bio/EnsEMBL/Production/Pipeline/PipeConfig/Xref_update_conf.pm index b72c803c2..69947d710 100644 --- a/modules/Bio/EnsEMBL/Production/Pipeline/PipeConfig/Xref_update_conf.pm +++ b/modules/Bio/EnsEMBL/Production/Pipeline/PipeConfig/Xref_update_conf.pm @@ -25,7 +25,7 @@ use warnings; use base ('Bio::EnsEMBL::Production::Pipeline::PipeConfig::Base_conf'); use Bio::EnsEMBL::Hive::PipeConfig::HiveGeneric_conf; -use Bio::EnsEMBL::Hive::Version 2.5; +use Bio::EnsEMBL::Hive::Version 2.7; sub default_options { diff --git a/modules/Bio/EnsEMBL/Production/Pipeline/ProteinFeatures/LoadUniParc.pm b/modules/Bio/EnsEMBL/Production/Pipeline/ProteinFeatures/LoadUniParc.pm index 40eddb1b4..444b7a2bb 100644 --- a/modules/Bio/EnsEMBL/Production/Pipeline/ProteinFeatures/LoadUniParc.pm +++ b/modules/Bio/EnsEMBL/Production/Pipeline/ProteinFeatures/LoadUniParc.pm @@ -21,16 +21,27 @@ package Bio::EnsEMBL::Production::Pipeline::ProteinFeatures::LoadUniParc; use strict; use warnings; - +use IO::Uncompress::Gunzip qw(gunzip $GunzipError); use File::Basename; - use base ('Bio::EnsEMBL::Production::Pipeline::Common::Base'); sub run { my ($self) = @_; my $uniparc_file = $self->param_required('uniparc_file_local'); + if (-e $uniparc_file) { + + #check if uniparc file is compressed + if ($uniparc_file =~ /\.gz$/){ + my $uniparc_file_decompress = $uniparc_file; + $uniparc_file_decompress =~ s/\.gz$//; + gunzip $uniparc_file => $uniparc_file_decompress or $self->throw("gunzip failed: $GunzipError"); + #delete compressed file .gz + unlink $uniparc_file or $self->throw("unable to delete $uniparc_file: $!"); + $uniparc_file = $uniparc_file_decompress; + } + my $dbh = $self->hive_dbh; my $sql = "LOAD DATA LOCAL INFILE '$uniparc_file' INTO TABLE uniparc FIELDS TERMINATED BY ' '"; $dbh->do($sql) or $self->throw($dbh->errstr); @@ -41,9 +52,14 @@ sub run { my $index_2 = 'ALTER TABLE uniparc ADD KEY md5sum_idx (md5sum) USING HASH'; $dbh->do($index_2) or $self->throw($dbh->errstr); + #delete upidump file from pipeline directory after loading into hive db + unlink $uniparc_file or $self->throw("unable to delete $uniparc_file: $!"); + } else { $self->throw("Checksum file '$uniparc_file' does not exist"); } + + } 1; diff --git a/modules/Bio/EnsEMBL/Production/Pipeline/StableID/sql/index.sql b/modules/Bio/EnsEMBL/Production/Pipeline/StableID/sql/index.sql index a807f370a..d05766336 100644 --- a/modules/Bio/EnsEMBL/Production/Pipeline/StableID/sql/index.sql +++ b/modules/Bio/EnsEMBL/Production/Pipeline/StableID/sql/index.sql @@ -13,5 +13,4 @@ -- See the License for the specific language governing permissions and -- limitations under the License.
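-- A single composite index with column order (stable_id, object_type, db_type), as created below, can serve
-- leftmost-prefix lookups, which is what makes the two narrower indexes redundant. Sketch queries
-- (hypothetical stable ID) that the one index can satisfy:
--   SELECT species_id FROM stable_id_lookup WHERE stable_id = 'ENSTEST00000000001';
--   SELECT species_id FROM stable_id_lookup WHERE stable_id = 'ENSTEST00000000001' AND object_type = 'Gene';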
-CREATE INDEX stable_id_db_type ON stable_id_lookup(stable_id, db_type, object_type); -CREATE INDEX stable_id_object_type ON stable_id_lookup(stable_id, object_type); +CREATE INDEX stable_id_db_type ON stable_id_lookup(stable_id, object_type, db_type); diff --git a/modules/Bio/EnsEMBL/Production/Pipeline/StableID/sql/table.sql b/modules/Bio/EnsEMBL/Production/Pipeline/StableID/sql/table.sql index 4ae7ac6ec..6319894a5 100644 --- a/modules/Bio/EnsEMBL/Production/Pipeline/StableID/sql/table.sql +++ b/modules/Bio/EnsEMBL/Production/Pipeline/StableID/sql/table.sql @@ -14,15 +14,12 @@ -- limitations under the License. CREATE TABLE archive_id_lookup ( - archive_id VARCHAR(128) NOT NULL, + archive_id VARCHAR(100) NOT NULL, species_id INTEGER UNSIGNED NOT NULL, - db_type VARCHAR(255) NOT NULL, - object_type VARCHAR(255) NOT NULL, - - UNIQUE INDEX archive_id_lookup_idx (archive_id, species_id, db_type, object_type), - KEY archive_id_db_type (archive_id, db_type, object_type), - KEY archive_id_object_type (archive_id, object_type) + db_type VARCHAR(20) NOT NULL, + object_type VARCHAR(20) NOT NULL, + UNIQUE INDEX archive_id_lookup_idx (archive_id, object_type, db_type, species_id) ) COLLATE=latin1_swedish_ci ENGINE=MyISAM; CREATE TABLE meta ( @@ -48,9 +45,9 @@ CREATE TABLE species ( ) COLLATE=latin1_swedish_ci ENGINE=MyISAM; CREATE TABLE stable_id_lookup ( - stable_id VARCHAR(128) NOT NULL, + stable_id VARCHAR(100) NOT NULL, species_id INTEGER UNSIGNED NOT NULL, - db_type VARCHAR(255) NOT NULL, - object_type VARCHAR(255) NOT NULL + db_type VARCHAR(20) NOT NULL, + object_type VARCHAR(20) NOT NULL ) COLLATE=latin1_swedish_ci ENGINE=MyISAM; diff --git a/modules/Bio/EnsEMBL/Production/Pipeline/TSV/DumpFile.pm b/modules/Bio/EnsEMBL/Production/Pipeline/TSV/DumpFile.pm index dde019e2f..b713fa3b8 100644 --- a/modules/Bio/EnsEMBL/Production/Pipeline/TSV/DumpFile.pm +++ b/modules/Bio/EnsEMBL/Production/Pipeline/TSV/DumpFile.pm @@ -55,7 +55,6 @@ return; sub run { my ($self) = @_; - $self->info( "Starting tsv dump for " . $self->param('species')); $self->_write_tsv(); $self->_create_README(); @@ -115,7 +114,7 @@ sub _write_tsv { }#transcript }#gene }#slice - close $fh; + close $fh; $self->core_dbc()->disconnect_if_idle(); return; } diff --git a/modules/Bio/EnsEMBL/Production/Pipeline/TSV/DumpFileEna.pm b/modules/Bio/EnsEMBL/Production/Pipeline/TSV/DumpFileEna.pm index 3e80b6fef..75768cb7c 100644 --- a/modules/Bio/EnsEMBL/Production/Pipeline/TSV/DumpFileEna.pm +++ b/modules/Bio/EnsEMBL/Production/Pipeline/TSV/DumpFileEna.pm @@ -107,8 +107,8 @@ sub _write_tsv { if(!defined $row->[5]){ $row->[5] = $self->_find_contig($ta, $contig_ids, $row->[3] ); } elsif( !defined $row->[6] && defined $row->[4]){ - $row->[6] = $cds2acc->{$row->[4]}; - } + $row->[6] = $cds2acc->{$row->[4]}; + } if (defined $row->[5]) { $row->[5] =~ s/\.[0-9]+$//; diff --git a/modules/Bio/EnsEMBL/Production/Pipeline/TSV/DumpFileMetadata.pm b/modules/Bio/EnsEMBL/Production/Pipeline/TSV/DumpFileMetadata.pm index d63261607..ba7cde4b0 100644 --- a/modules/Bio/EnsEMBL/Production/Pipeline/TSV/DumpFileMetadata.pm +++ b/modules/Bio/EnsEMBL/Production/Pipeline/TSV/DumpFileMetadata.pm @@ -70,9 +70,9 @@ return; sub run { my ($self) = @_; - + $self->_make_karyotype_file(); - + return; } @@ -81,7 +81,7 @@ sub _make_karyotype_file { my $sp = $self->param_required('species'); my $sa = Bio::EnsEMBL::Registry->get_adaptor($sp, 'core', 'slice'); - + if(! 
$sa) { $self->info("Cannot continue as we cannot find a core:slice DBAdaptor for %s", $sp); return; } @@ -92,7 +92,7 @@ my $slices = $sa->fetch_all_karyotype(); # If we don't have any slices (ie. chromosomes), don't make the file return unless(scalar(@$slices)); - + my $file = $self->_generate_file_name(); work_with_file($file, 'w', sub { diff --git a/modules/Bio/EnsEMBL/Production/Pipeline/TSV/DumpFileXref.pm b/modules/Bio/EnsEMBL/Production/Pipeline/TSV/DumpFileXref.pm index fffe56ded..df33c0d76 100644 --- a/modules/Bio/EnsEMBL/Production/Pipeline/TSV/DumpFileXref.pm +++ b/modules/Bio/EnsEMBL/Production/Pipeline/TSV/DumpFileXref.pm @@ -66,7 +66,9 @@ sub run { return; } +sub write { +} ############# ##SUBROUTINES ############# @@ -74,7 +76,7 @@ sub _write_tsv { my ($self) = @_; my $out_file = $self->_generate_file_name(); - my $header = $self->_build_headers(); + my $header = $self->_build_headers(); open my $fh, '>', $out_file or die "cannot open $out_file for writing!"; print $fh join ("\t", @$header); @@ -112,9 +114,9 @@ my $xref_db = $dbentry->dbname(); my $xref_info_type= $dbentry->info_type(); - if ($dbentry->isa('Bio::EnsEMBL::IdentityXref')){ - $src_identity = $dbentry->ensembl_identity(); - $xref_identity = $dbentry->xref_identity(); + if ($dbentry->isa('Bio::EnsEMBL::IdentityXref')){ + $src_identity = $dbentry->ensembl_identity(); + $xref_identity = $dbentry->xref_identity(); } $linkage_type = join(' ', @{$dbentry->get_all_linkage_types()})if($dbentry->isa('Bio::EnsEMBL::OntologyXref')); print $fh "$g_id\t$tr_id\t$tl_id\t$xref_id\t$xref_db\t$xref_info_type\t$src_identity\t$xref_identity\t$linkage_type\n"; @@ -122,8 +124,9 @@ }#dbentry }#transcript }#gene - }#slice - close $fh; + }#slice + close $fh; + if ($xrefs_exist == 1) { $self->dataflow_output_id( @@ -132,7 +135,6 @@ # If we have no xrefs, delete the file (which will just have a header). unlink $out_file or die "failed to delete $out_file!"; } - return; } diff --git a/modules/Bio/EnsEMBL/Production/Pipeline/TaxonomyUpdate/QueryMetadata.pm b/modules/Bio/EnsEMBL/Production/Pipeline/TaxonomyUpdate/QueryMetadata.pm index 3438dd571..025cc3a65 100644 --- a/modules/Bio/EnsEMBL/Production/Pipeline/TaxonomyUpdate/QueryMetadata.pm +++ b/modules/Bio/EnsEMBL/Production/Pipeline/TaxonomyUpdate/QueryMetadata.pm @@ -82,7 +82,24 @@ sub _meta { $self->warning('Querying Taxonomy'); #my $tdba = Bio::EnsEMBL::Registry->get_DBAdaptor( "multi", "taxonomy" ); my $taxonomy = $self->_taxonomy( $tdba, $metadata->{'species.taxonomy_id'}); - $metadata->{'species.classification'} = $taxonomy; + + if (scalar(@{$taxonomy}) == 0) { + my $dbname = $self->param('dbname'); + my $species = $self->param('species'); + my $taxon_id = $metadata->{'species.taxonomy_id'}; + my $msg = "Cannot retrieve taxonomy classification for species $species with taxonomy_id $taxon_id in database $dbname;"; + + my $primary_taxon_id = $self->_fetch_primary_taxon_id($tdba, $taxon_id); + if (defined $primary_taxon_id) { + $msg = $msg + ." taxon_id $taxon_id has been merged into primary taxon_id $primary_taxon_id." + ." Please set the 'species.taxonomy_id' meta entry for this species to the" + ." primary taxon_id and ensure the change is propagated to other relevant databases."; + } + throw $msg; + } + + $metadata->{'species.classification'} = $taxonomy; $self->warning('Updating meta'); foreach my $key ( keys %{$metadata} ) { my $array = wrap_array( $metadata->{$key} ); @@ -150,6 +167,34 @@ } +sub _fetch_primary_taxon_id { + my ( $self, $tdba, $taxon_id ) = @_; + $self->warning('Querying taxonomy to fetch primary taxon_id'); + my $sql = <<'SQL'; +select taxon_id +from ncbi_taxa_name +where name = ? +and name_class = 'merged_taxon_id' +SQL + my $dbc = $tdba->dbc(); + my $res = $dbc->sql_helper()->execute_simple( + -SQL => $sql, + -PARAMS => [ $taxon_id ] + ); + $self->debug( 'Result is [%s]', join( q{, }, @{$res} ) ); + my $result_count = scalar(@{$res}); + + my $primary_taxon_id; + if ($result_count == 1) { + $primary_taxon_id = $res->[0]; + } elsif ($result_count > 1) { + throw "Expected at most 1 primary taxon_id for $taxon_id but got $result_count"; + } + + return $primary_taxon_id; +} + + sub _remove_deprecated { my ($self, $meta_container) = @_; diff --git a/modules/Bio/EnsEMBL/Production/Pipeline/Xrefs/Alignment.pm b/modules/Bio/EnsEMBL/Production/Pipeline/Xrefs/Alignment.pm index b3233ea9d..ef4868850 100644 --- a/modules/Bio/EnsEMBL/Production/Pipeline/Xrefs/Alignment.pm +++ b/modules/Bio/EnsEMBL/Production/Pipeline/Xrefs/Alignment.pm @@ -84,10 +84,18 @@ sub run { $exe =~ s/\n//g; my $command_string = sprintf ("%s --showalignment FALSE --showvulgar FALSE --ryo '%s' --gappedextension FALSE --model 'affine:local' %s --subopt no --query %s --target %s --querychunktotal %s --querychunkid %s", $exe, $ryo, $method, $source, $target, $max_chunks, $chunk); my $output = `$command_string`; - my @hits = grep {$_ =~ /^xref/} split "\n", $output; # not all lines in output are alignments - while (my $hit = shift @hits) { - print $fh $hit . "\n"; + if ($? == 0) { + my @hits = grep {$_ =~ /^xref/} split "\n", $output; # not all lines in output are alignments + + while (my $hit = shift @hits) { + print $fh $hit . 
"\n"; + } + } else { + my $job = $self->input_job(); + $job->adaptor()->db()->get_LogMessageAdaptor()->store_job_message($job->dbID(), $output, 'WORKER_ERROR'); + + throw("Exonerate failed with exit_code: $?\n"); } $fh->close(); diff --git a/modules/Bio/EnsEMBL/Production/Pipeline/Xrefs/CleanupRefseqPeptide.pm b/modules/Bio/EnsEMBL/Production/Pipeline/Xrefs/CleanupRefseqPeptide.pm index 313773f1f..3dd8e2aff 100644 --- a/modules/Bio/EnsEMBL/Production/Pipeline/Xrefs/CleanupRefseqPeptide.pm +++ b/modules/Bio/EnsEMBL/Production/Pipeline/Xrefs/CleanupRefseqPeptide.pm @@ -89,7 +89,7 @@ sub run { while (<$in_fh>) { if ($_ =~ /^REFERENCE/ || $_ =~ /^COMMENT/ || $_ =~ /^\s{5}Protein/) { $skip_data = 1; - } elsif ($_ =~ /^\s{5}source/ || $_ =~ /^\s{5}CDS/) { + } elsif ($_ =~ /^\s{5}source/ || $_ =~ /^\s{5}CDS/ || $_ =~ /^ORIGIN/) { $skip_data = 0; } if (!$skip_data) {print $out_fh $_;} diff --git a/modules/Bio/EnsEMBL/Production/Pipeline/Xrefs/CleanupUniprot.pm b/modules/Bio/EnsEMBL/Production/Pipeline/Xrefs/CleanupUniprot.pm index 522a74080..df744e250 100644 --- a/modules/Bio/EnsEMBL/Production/Pipeline/Xrefs/CleanupUniprot.pm +++ b/modules/Bio/EnsEMBL/Production/Pipeline/Xrefs/CleanupUniprot.pm @@ -116,7 +116,7 @@ sub run { $_ =~ s/\nCC\s{3}.*//g; # Remove comments $_ =~ s/\nCT(\s{3}.*)CAUTION: The sequence shown here is derived from an Ensembl(.*)/\nCC$1CAUTION: The sequence shown here is derived from an Ensembl$2/g; # Set temp line back to comment $_ =~ s/\nFT\s{3}.*//g; # Remove feature coordinates - $_ =~ s/\nDR\s{3}($sources_to_remove);.*\n//g; # Remove sources skipped at processing + $_ =~ s/\nDR\s{3}($sources_to_remove);.*//g; # Remove sources skipped at processing # Added lines that we do need into output print $out_fh $_; diff --git a/modules/Bio/EnsEMBL/Production/Pipeline/Xrefs/DumpXref.pm b/modules/Bio/EnsEMBL/Production/Pipeline/Xrefs/DumpXref.pm index 3a184d14c..635e6fd8d 100644 --- a/modules/Bio/EnsEMBL/Production/Pipeline/Xrefs/DumpXref.pm +++ b/modules/Bio/EnsEMBL/Production/Pipeline/Xrefs/DumpXref.pm @@ -99,7 +99,7 @@ sub run { # Ambiguous peptides must be cleaned out to protect Exonerate from J,O and U codes $row[1] = uc($row[1]); $row[1] =~ s/(.{60})/$1\n/g; - if ($seq_type eq 'pep') { $row[1] =~ tr/JOU/X/ } + if ($seq_type eq 'peptide') { $row[1] =~ tr/JOU/X/ } print $DH ">".$row[0]."\n".$row[1]."\n"; } $mapping_source_sth->execute($source_id, $seq_type); diff --git a/modules/Bio/EnsEMBL/Production/Pipeline/Xrefs/ParseSource.pm b/modules/Bio/EnsEMBL/Production/Pipeline/Xrefs/ParseSource.pm index 068bf107f..e9d807beb 100644 --- a/modules/Bio/EnsEMBL/Production/Pipeline/Xrefs/ParseSource.pm +++ b/modules/Bio/EnsEMBL/Production/Pipeline/Xrefs/ParseSource.pm @@ -79,13 +79,17 @@ sub run { file => $file_name}) ; $self->cleanup_DBAdaptor($db); } else { - $failure += $xref_run->run( { source_id => $source_id, - species_id => $species_id, - species => $species, - rel_file => $release_file, - dbi => $dbi, - xref_source => $source_dbi, - files => [@files] }) ; + my $run_params = { + source_id => $source_id, + species_id => $species_id, + species => $species, + rel_file => $release_file, + dbi => $dbi, + xref_source => $source_dbi, + files => [@files] + }; + $run_params->{hgnc_file} = $self->param('hgnc_file') if ($parser =~ /^UniProt/); + $failure += $xref_run->run( $run_params ) ; } if ($failure) { die; } diff --git a/modules/Bio/EnsEMBL/Production/Pipeline/Xrefs/Parser/UniProtDatabaseParser.pm b/modules/Bio/EnsEMBL/Production/Pipeline/Xrefs/Parser/UniProtDatabaseParser.pm index 
e47bcc2de..e8db14138 100644 --- a/modules/Bio/EnsEMBL/Production/Pipeline/Xrefs/Parser/UniProtDatabaseParser.pm +++ b/modules/Bio/EnsEMBL/Production/Pipeline/Xrefs/Parser/UniProtDatabaseParser.pm @@ -242,7 +242,7 @@ sub run { # Make sure these are still lines with Name or Synonyms if (($gn !~ /^GN/ || $gn !~ /Name=/) && $gn !~ /Synonyms=/) { last; } - if ($gn =~ / Name=([A-Za-z0-9_\-\.\s]+)/s) { #/s for multi-line entries ; is the delimiter + if ($gn =~ / Name=([A-Za-z0-9_\-\.\s:]+)/s) { #/s for multi-line entries ; is the delimiter # Example line # GN Name=ctrc {ECO:0000313|Xenbase:XB-GENE-5790348}; my $name = $1; diff --git a/modules/Bio/EnsEMBL/Production/Pipeline/Xrefs/ScheduleSource.pm b/modules/Bio/EnsEMBL/Production/Pipeline/Xrefs/ScheduleSource.pm index 48f20cbc3..fd56a7d5a 100644 --- a/modules/Bio/EnsEMBL/Production/Pipeline/Xrefs/ScheduleSource.pm +++ b/modules/Bio/EnsEMBL/Production/Pipeline/Xrefs/ScheduleSource.pm @@ -83,12 +83,16 @@ sub run { # Retrieve list of sources from versioning database my ($source_user, $source_pass, $source_host, $source_port, $source_db) = $self->parse_url($source_url); my $dbi = $self->get_dbi($source_host, $source_port, $source_user, $source_pass, $source_db); - my $select_source_sth = $dbi->prepare("SELECT distinct name, parser, uri, clean_uri, index_uri, count_seen, preparse, revision FROM source s, version v WHERE s.source_id = v.source_id"); + my $select_source_sth = $dbi->prepare("SELECT distinct name, parser, uri, clean_uri, index_uri, count_seen, preparse, revision FROM source s, version v WHERE s.source_id = v.source_id order by name"); my ($name, $parser, $file_name, $clean_file_name, $dataflow_params, $db, $priority, $release_file); $select_source_sth->execute(); $select_source_sth->bind_columns(\$name, \$parser, \$file_name, \$clean_file_name, \$db, \$priority, \$preparse, \$release_file); + my $hgnc_path; + while ($select_source_sth->fetch()) { + $hgnc_path = $file_name if ($name eq 'HGNC'); + if (defined $db && $db eq 'checksum') { next; } if ($priority != $order_priority) { next; } if (defined $clean_file_name) { $file_name = $clean_file_name; } @@ -126,11 +130,34 @@ sub run { $self->dataflow_output_id($dataflow_params, 2); } else { # Create list of files - my @list_files = `ls $file_name`; + opendir(my $dir_handle, $file_name); + my @temp_list_files = readdir($dir_handle); + closedir($dir_handle); + + my @list_files; + foreach my $file (@temp_list_files) { + next if ($file =~ /^\./); + push(@list_files, $file_name . "/" . $file); + } if ($preparse) { @list_files = $preparse; } + + # For Uniprot and Refseq, files might have been split by species + if (!$preparse && ($name =~ /^Uniprot/ || $name =~ /^RefSeq_peptide/ || $name =~ /^RefSeq_dna/)) { + my $file_prefix = ($name =~ /SPTREMBL/ ? 'uniprot_trembl' : ($name =~ /SWISSPROT/ ? 'uniprot_sprot' : ($name =~ /_dna/ ? 'refseq_rna' : 'refseq_protein'))); + my @species_list_files = glob($file_name . "/**/**/**/**/" . $file_prefix . "-" . $species_id); + if (scalar(@species_list_files) > 0) { + @list_files = @species_list_files; + } + } + + # For ZFIN, we only need 1 job (parser handles all the files) + if ($name eq 'ZFIN_ID') { + @list_files = $list_files[0]; + } + foreach my $file (@list_files) { $file =~ s/\n//; - $file = $file_name . "/" . 
$file; + if (!-f $file) { next; } if (defined $release_file and $file eq $release_file) { next; } $dataflow_params = { @@ -144,6 +171,10 @@ sub run { priority => $priority, file_name => $file }; + if ($name =~ /^Uniprot/) { + my @hgnc_files = glob( $hgnc_path . '/*' ); + $dataflow_params->{hgnc_file} = $hgnc_files[0]; + } $self->dataflow_output_id($dataflow_params, 2); } } diff --git a/modules/Bio/EnsEMBL/Production/Pipeline/Xrefs/gencode_sources.json b/modules/Bio/EnsEMBL/Production/Pipeline/Xrefs/gencode_sources.json index 140082a34..f34e6f977 100644 --- a/modules/Bio/EnsEMBL/Production/Pipeline/Xrefs/gencode_sources.json +++ b/modules/Bio/EnsEMBL/Production/Pipeline/Xrefs/gencode_sources.json @@ -16,7 +16,7 @@ { "name" : "UniParc", "parser" : "ChecksumParser", - "file" : "https://ftp.ebi.ac.uk/pub/contrib/uniparc/upidump.lis", + "file" : "https://ftp.ebi.ac.uk/pub/contrib/uniparc/upidump.lis.gz", "db" : "checksum", "priority" : 1 }, @@ -194,7 +194,7 @@ { "name" : "miRBase", "parser" : "miRBaseParser", - "file" : "https://mirbase.org/ftp/CURRENT/miRNA.dat.gz", + "file" : "https://mirbase.org/download/miRNA.dat", "method" : "--bestn 1", "query_cutoff" : 90, "target_cutoff" : 90, @@ -203,7 +203,7 @@ { "name" : "HGNC", "parser" : "HGNCParser", - "file" : "https://www.genenames.org/cgi-bin/download?col=gd_hgnc_id&col=gd_app_sym&col=gd_app_name&col=gd_prev_sym&col=gd_aliases&col=gd_pub_eg_id&col=gd_pub_ensembl_id&col=gd_pub_refseq_ids&col=gd_ccds_ids&col=gd_lsdb_links&status=Approved&status_opt=2&where=&order_by=gd_app_sym_sort&format=text&limit=&hgnc_dbtag=on&submit=submit", + "file" : "https://www.genenames.org/cgi-bin/download/custom?col=gd_hgnc_id&col=gd_app_sym&col=gd_app_name&col=gd_prev_sym&col=gd_aliases&col=gd_pub_eg_id&col=gd_pub_ensembl_id&col=gd_pub_refseq_ids&col=gd_ccds_ids&col=gd_lsdb_links&status=Approved&status_opt=2&where=&order_by=gd_app_sym_sort&format=text&limit=&hgnc_dbtag=on&submit=submit", "db" : "ccds", "priority" : 3 } diff --git a/modules/Bio/EnsEMBL/Production/Pipeline/Xrefs/xref_all_sources.json b/modules/Bio/EnsEMBL/Production/Pipeline/Xrefs/xref_all_sources.json index 7ef812874..07c48e29a 100644 --- a/modules/Bio/EnsEMBL/Production/Pipeline/Xrefs/xref_all_sources.json +++ b/modules/Bio/EnsEMBL/Production/Pipeline/Xrefs/xref_all_sources.json @@ -6,17 +6,10 @@ "db" : "core", "priority" : 1 }, - { - "name" : "CCDS", - "parser" : "CCDSParser", - "file" : "Database", - "db" : "ccds", - "priority" : 1 - }, { "name" : "UniParc", "parser" : "ChecksumParser", - "file" : "https://ftp.ebi.ac.uk/pub/contrib/uniparc/upidump.lis", + "file" : "https://ftp.ebi.ac.uk/pub/contrib/uniparc/upidump.lis.gz", "db" : "checksum", "priority" : 1 }, @@ -197,19 +190,19 @@ "name" : "ZFIN_ID", "parser" : "ZFINParser", "file" : "http://zfin.org/data_transfer/Downloads/uniprot.txt", - "priority" : 2 + "priority" : 3 }, { "name" : "ZFIN_ID", "parser" : "ZFINParser", "file" : "http://zfin.org/data_transfer/Downloads/aliases.txt", - "priority" : 2 + "priority" : 3 }, { "name" : "ZFIN_ID", "parser" : "ZFINParser", - "file" : "http://zfin.org/data_transfer/Downloads/gene_seq.txt", - "priority" : 1 + "file" : "https://zfin.org/downloads/ensembl_1_to_1.txt", + "priority" : 3 }, { "name" : "ZFIN_desc", @@ -226,13 +219,13 @@ { "name" : "Xenbase", "parser" : "XenopusJamboreeParser", - "file" : "http://ftp.xenbase.org/pub/GenePageReports/GenePageEnsemblModelMapping.txt", + "file" : "http://ftp.xenbase.org/pub/GenePageReports/GenePageEnsemblModelMapping_4.1.txt", "priority" : 1 }, { "name" : "miRBase", 
"parser" : "miRBaseParser", - "file" : "https://mirbase.org/ftp/CURRENT/miRNA.dat.gz", + "file" : "https://mirbase.org/download/miRNA.dat", "method" : "--bestn 1", "query_cutoff" : 90, "target_cutoff" : 90, @@ -241,7 +234,7 @@ { "name" : "HGNC", "parser" : "HGNCParser", - "file" : "https://www.genenames.org/cgi-bin/download?col=gd_hgnc_id&col=gd_app_sym&col=gd_app_name&col=gd_prev_sym&col=gd_aliases&col=gd_pub_eg_id&col=gd_pub_ensembl_id&col=gd_pub_refseq_ids&col=gd_ccds_ids&col=gd_lsdb_links&status=Approved&status_opt=2&where=&order_by=gd_app_sym_sort&format=text&limit=&hgnc_dbtag=on&submit=submit", + "file" : "https://www.genenames.org/cgi-bin/download/custom?col=gd_hgnc_id&col=gd_app_sym&col=gd_app_name&col=gd_prev_sym&col=gd_aliases&col=gd_pub_eg_id&col=gd_pub_ensembl_id&col=gd_pub_refseq_ids&col=gd_ccds_ids&col=gd_lsdb_links&status=Approved&status_opt=2&where=&order_by=gd_app_sym_sort&format=text&limit=&hgnc_dbtag=on&submit=submit", "db" : "ccds", "priority" : 3 } diff --git a/modules/Bio/EnsEMBL/Production/Pipeline/Xrefs/xref_sources.json b/modules/Bio/EnsEMBL/Production/Pipeline/Xrefs/xref_sources.json index 17fa985f1..0070a69d4 100644 --- a/modules/Bio/EnsEMBL/Production/Pipeline/Xrefs/xref_sources.json +++ b/modules/Bio/EnsEMBL/Production/Pipeline/Xrefs/xref_sources.json @@ -16,7 +16,7 @@ { "name" : "UniParc", "parser" : "ChecksumParser", - "file" : "https://ftp.ebi.ac.uk/pub/contrib/uniparc/upidump.lis", + "file" : "https://ftp.ebi.ac.uk/pub/contrib/uniparc/upidump.lis.gz", "db" : "checksum", "priority" : 1 }, @@ -254,13 +254,13 @@ { "name" : "Xenbase", "parser" : "XenopusJamboreeParser", - "file" : "http://ftp.xenbase.org/pub/GenePageReports/GenePageEnsemblModelMapping.txt", + "file" : "http://ftp.xenbase.org/pub/GenePageReports/GenePageEnsemblModelMapping_4.1.txt", "priority" : 1 }, { "name" : "miRBase", "parser" : "miRBaseParser", - "file" : "https://mirbase.org/ftp/CURRENT/miRNA.dat.gz", + "file" : "https://mirbase.org/download/miRNA.dat", "method" : "--bestn 1", "query_cutoff" : 90, "target_cutoff" : 90, @@ -269,7 +269,7 @@ { "name" : "HGNC", "parser" : "HGNCParser", - "file" : "https://www.genenames.org/cgi-bin/download?col=gd_hgnc_id&col=gd_app_sym&col=gd_app_name&col=gd_prev_sym&col=gd_aliases&col=gd_pub_eg_id&col=gd_pub_ensembl_id&col=gd_pub_refseq_ids&col=gd_ccds_ids&col=gd_lsdb_links&status=Approved&status_opt=2&where=&order_by=gd_app_sym_sort&format=text&limit=&hgnc_dbtag=on&submit=submit", + "file" : "https://www.genenames.org/cgi-bin/download/custom?col=gd_hgnc_id&col=gd_app_sym&col=gd_app_name&col=gd_prev_sym&col=gd_aliases&col=gd_pub_eg_id&col=gd_pub_ensembl_id&col=gd_pub_refseq_ids&col=gd_ccds_ids&col=gd_lsdb_links&status=Approved&status_opt=2&where=&order_by=gd_app_sym_sort&format=text&limit=&hgnc_dbtag=on&submit=submit", "db" : "ccds", "priority" : 3 } diff --git a/modules/t/test-genome-DBs/homo_sapiens/core/meta.txt b/modules/t/test-genome-DBs/homo_sapiens/core/meta.txt index 811bab016..1840fcdb1 100644 --- a/modules/t/test-genome-DBs/homo_sapiens/core/meta.txt +++ b/modules/t/test-genome-DBs/homo_sapiens/core/meta.txt @@ -1,5 +1,5 @@ 1 \N schema_type core -2 \N schema_version 111 +2 \N schema_version 114 3 \N patch patch_98_99_a.sql|schema_version 2124 1 xref.timestamp 2013-07-22 11:20:10 4 \N patch patch_52_53_c.sql|identity_xref_rename @@ -261,3 +261,9 @@ 2217 \N patch patch_109_110_b.sql|Add IS_PAR relationship to link X- and Y-PAR genes 2218 \N patch patch_109_110_c.sql|Allow gene id to belong to multiple alt allele groups 2219 \N patch 
patch_110_111_a.sql|schema_version +2220 \N patch patch_111_112_a.sql|schema_version +2221 \N patch patch_111_112_b.sql|Allow meta_value to be null +2222 \N patch patch_111_112_c.sql|Extend meta_key length to 64 +2223 \N patch patch_112_113_a.sql|schema_version +2224 \N patch patch_112_113_b.sql|Ensure meta_value is not null +2225 \N patch patch_113_114_a.sql|schema_version diff --git a/modules/t/test-genome-DBs/homo_sapiens/core/table.sql b/modules/t/test-genome-DBs/homo_sapiens/core/table.sql index 70bd53327..76867bd9b 100644 --- a/modules/t/test-genome-DBs/homo_sapiens/core/table.sql +++ b/modules/t/test-genome-DBs/homo_sapiens/core/table.sql @@ -485,12 +485,12 @@ CREATE TABLE `marker_synonym` ( CREATE TABLE `meta` ( `meta_id` int(11) NOT NULL AUTO_INCREMENT, `species_id` int(10) unsigned DEFAULT '1', - `meta_key` varchar(40) NOT NULL, + `meta_key` varchar(64) NOT NULL, `meta_value` varchar(255) NOT NULL, PRIMARY KEY (`meta_id`), UNIQUE KEY `species_key_value_idx` (`species_id`,`meta_key`,`meta_value`), KEY `species_value_idx` (`species_id`,`meta_value`) -) ENGINE=MyISAM AUTO_INCREMENT=2220 DEFAULT CHARSET=latin1; +) ENGINE=MyISAM AUTO_INCREMENT=2226 DEFAULT CHARSET=latin1; CREATE TABLE `meta_coord` ( `table_name` varchar(40) NOT NULL, diff --git a/modules/t/test-genome-DBs/homo_sapiens/empty/meta.txt b/modules/t/test-genome-DBs/homo_sapiens/empty/meta.txt index 87b939fd5..e3a5b80c8 100644 --- a/modules/t/test-genome-DBs/homo_sapiens/empty/meta.txt +++ b/modules/t/test-genome-DBs/homo_sapiens/empty/meta.txt @@ -1,4 +1,4 @@ -1 \N schema_version 111 +1 \N schema_version 114 2 1 assembly.default NCBI34 33 1 species.classification Chordata 32 1 species.classification Vertebrata @@ -126,3 +126,9 @@ 176 \N patch patch_109_110_b.sql|Add IS_PAR relationship to link X- and Y-PAR genes 177 \N patch patch_109_110_c.sql|Allow gene id to belong to multiple alt allele groups 178 \N patch patch_110_111_a.sql|schema_version +179 \N patch patch_111_112_a.sql|schema_version +180 \N patch patch_111_112_b.sql|Allow meta_value to be null +181 \N patch patch_111_112_c.sql|Extend meta_key length to 64 +182 \N patch patch_112_113_a.sql|schema_version +183 \N patch patch_112_113_b.sql|Ensure meta_value is not null +184 \N patch patch_113_114_a.sql|schema_version diff --git a/modules/t/test-genome-DBs/homo_sapiens/empty/table.sql b/modules/t/test-genome-DBs/homo_sapiens/empty/table.sql index 4c32c6caf..e5db71a2e 100644 --- a/modules/t/test-genome-DBs/homo_sapiens/empty/table.sql +++ b/modules/t/test-genome-DBs/homo_sapiens/empty/table.sql @@ -490,12 +490,12 @@ CREATE TABLE `marker_synonym` ( CREATE TABLE `meta` ( `meta_id` int(11) NOT NULL AUTO_INCREMENT, `species_id` int(10) unsigned DEFAULT '1', - `meta_key` varchar(40) NOT NULL, + `meta_key` varchar(64) NOT NULL, `meta_value` varchar(255) NOT NULL, PRIMARY KEY (`meta_id`), UNIQUE KEY `species_key_value_idx` (`species_id`,`meta_key`,`meta_value`), KEY `species_value_idx` (`species_id`,`meta_value`) -) ENGINE=MyISAM AUTO_INCREMENT=179 DEFAULT CHARSET=latin1; +) ENGINE=MyISAM AUTO_INCREMENT=185 DEFAULT CHARSET=latin1; CREATE TABLE `meta_coord` ( `table_name` varchar(40) NOT NULL DEFAULT '', diff --git a/modules/t/test-genome-DBs/hp_dump/core/meta.txt b/modules/t/test-genome-DBs/hp_dump/core/meta.txt index d401f5666..9a3a430a0 100644 --- a/modules/t/test-genome-DBs/hp_dump/core/meta.txt +++ b/modules/t/test-genome-DBs/hp_dump/core/meta.txt @@ -1,5 +1,5 @@ 1 \N schema_type core -2 \N schema_version 111 +2 \N schema_version 114 3 \N patch 
patch_98_99_a.sql|schema_version 2124 1 xref.timestamp 2013-07-22 11:20:10 4 \N patch patch_52_53_c.sql|identity_xref_rename @@ -265,3 +265,9 @@ 2221 \N patch patch_109_110_b.sql|Add IS_PAR relationship to link X- and Y-PAR genes 2222 \N patch patch_109_110_c.sql|Allow gene id to belong to multiple alt allele groups 2223 \N patch patch_110_111_a.sql|schema_version +2224 \N patch patch_111_112_a.sql|schema_version +2225 \N patch patch_111_112_b.sql|Allow meta_value to be null +2226 \N patch patch_111_112_c.sql|Extend meta_key length to 64 +2227 \N patch patch_112_113_a.sql|schema_version +2228 \N patch patch_112_113_b.sql|Ensure meta_value is not null +2229 \N patch patch_113_114_a.sql|schema_version diff --git a/modules/t/test-genome-DBs/hp_dump/core/table.sql b/modules/t/test-genome-DBs/hp_dump/core/table.sql index df7288b40..f6e687e50 100644 --- a/modules/t/test-genome-DBs/hp_dump/core/table.sql +++ b/modules/t/test-genome-DBs/hp_dump/core/table.sql @@ -485,12 +485,12 @@ CREATE TABLE `marker_synonym` ( CREATE TABLE `meta` ( `meta_id` int(11) NOT NULL AUTO_INCREMENT, `species_id` int(10) unsigned DEFAULT '1', - `meta_key` varchar(40) NOT NULL, + `meta_key` varchar(64) NOT NULL, `meta_value` varchar(255) NOT NULL, PRIMARY KEY (`meta_id`), UNIQUE KEY `species_key_value_idx` (`species_id`,`meta_key`,`meta_value`), KEY `species_value_idx` (`species_id`,`meta_value`) -) ENGINE=MyISAM AUTO_INCREMENT=2224 DEFAULT CHARSET=latin1; +) ENGINE=MyISAM AUTO_INCREMENT=2230 DEFAULT CHARSET=latin1; CREATE TABLE `meta_coord` ( `table_name` varchar(40) NOT NULL, diff --git a/modules/t/test-genome-DBs/multi/compara/meta.txt b/modules/t/test-genome-DBs/multi/compara/meta.txt index d16a1c75c..69cf87d7d 100644 --- a/modules/t/test-genome-DBs/multi/compara/meta.txt +++ b/modules/t/test-genome-DBs/multi/compara/meta.txt @@ -1,6 +1,6 @@ 2 \N schema_type compara 3 \N patch patch_98_99_a.sql|schema_version -165 \N schema_version 111 +172 \N schema_version 114 4 \N patch patch_72_73_b.sql|homology_genetree_links 6 \N patch patch_73_74_a.sql|schema_version 7 \N patch patch_73_74_b.sql|hmm_profile @@ -126,3 +126,7 @@ 163 \N patch patch_109_110_b.sql|case_insensitive_stable_id_again 164 \N patch patch_109_110_c.sql|ncbi_taxa_name_varchar500 166 \N patch patch_110_111_a.sql|schema_version +168 \N patch patch_111_112_a.sql|schema_version +170 \N patch patch_112_113_a.sql|schema_version +171 \N patch patch_112_113_b.sql|meta_key_64 +173 \N patch patch_113_114_a.sql|schema_version diff --git a/modules/t/test-genome-DBs/multi/compara/table.sql b/modules/t/test-genome-DBs/multi/compara/table.sql index 49de3d28b..01d92fd28 100644 --- a/modules/t/test-genome-DBs/multi/compara/table.sql +++ b/modules/t/test-genome-DBs/multi/compara/table.sql @@ -436,12 +436,12 @@ CREATE TABLE `member_xref` ( CREATE TABLE `meta` ( `meta_id` int(11) NOT NULL AUTO_INCREMENT, `species_id` int(10) unsigned DEFAULT '1', - `meta_key` varchar(40) NOT NULL, + `meta_key` varchar(64) NOT NULL, `meta_value` text NOT NULL, PRIMARY KEY (`meta_id`), UNIQUE KEY `species_key_value_idx` (`species_id`,`meta_key`,`meta_value`(255)), KEY `species_value_idx` (`species_id`,`meta_value`(255)) -) ENGINE=MyISAM AUTO_INCREMENT=167 DEFAULT CHARSET=latin1; +) ENGINE=MyISAM AUTO_INCREMENT=174 DEFAULT CHARSET=latin1; CREATE TABLE `method_link` ( `method_link_id` int(10) unsigned NOT NULL AUTO_INCREMENT, diff --git a/modules/t/test-genome-DBs/s_cerevisiae/core/meta.txt b/modules/t/test-genome-DBs/s_cerevisiae/core/meta.txt index fe9f9a707..2f62db00f 100644 --- 
a/modules/t/test-genome-DBs/s_cerevisiae/core/meta.txt +++ b/modules/t/test-genome-DBs/s_cerevisiae/core/meta.txt @@ -1,5 +1,5 @@ 1 \N schema_type core -2 \N schema_version 111 +2 \N schema_version 112 3 \N patch patch_98_99_a.sql|schema_version 4 \N patch patch_60_61_b.sql|create_seq_region_synonym_table 5 \N patch patch_60_61_c.sql|rejig_object_xref_indexes @@ -190,3 +190,5 @@ 679 \N patch patch_109_110_b.sql|Add IS_PAR relationship to link X- and Y-PAR genes 680 \N patch patch_109_110_c.sql|Allow gene id to belong to multiple alt allele groups 681 \N patch patch_110_111_a.sql|schema_version +682 \N patch patch_111_112_a.sql|schema_version +683 \N patch patch_111_112_b.sql|Allow meta_value to be null diff --git a/modules/t/test-genome-DBs/s_cerevisiae/core/table.sql b/modules/t/test-genome-DBs/s_cerevisiae/core/table.sql index 7ade412de..2bfa40eba 100644 --- a/modules/t/test-genome-DBs/s_cerevisiae/core/table.sql +++ b/modules/t/test-genome-DBs/s_cerevisiae/core/table.sql @@ -486,11 +486,11 @@ CREATE TABLE `meta` ( `meta_id` int(11) NOT NULL AUTO_INCREMENT, `species_id` int(10) unsigned DEFAULT '1', `meta_key` varchar(40) NOT NULL, - `meta_value` varchar(255) NOT NULL, + `meta_value` varchar(255) DEFAULT NULL, PRIMARY KEY (`meta_id`), UNIQUE KEY `species_key_value_idx` (`species_id`,`meta_key`,`meta_value`), KEY `species_value_idx` (`species_id`,`meta_value`) -) ENGINE=MyISAM AUTO_INCREMENT=682 DEFAULT CHARSET=latin1; +) ENGINE=MyISAM AUTO_INCREMENT=684 DEFAULT CHARSET=latin1; CREATE TABLE `meta_coord` ( `table_name` varchar(40) NOT NULL, diff --git a/nextflow/config/xref.config b/nextflow/config/xref.config new file mode 100644 index 000000000..024f80e68 --- /dev/null +++ b/nextflow/config/xref.config @@ -0,0 +1,85 @@ +includeConfig './base.config' + +params.pipeline_dir = "$PWD" +params.user = "$USER" +params.email = "${params.user}@ebi.ac.uk" +params.email_server = "hh-smtp.ebi.ac.uk:25" + +params.work_dir = "$BASE_DIR" +params.scripts_dir = "${params.work_dir}/ensembl-production/src/python/scripts/" +params.perl_scripts_dir = "${params.work_dir}/ensembl-production/scripts/xrefs/" + +params.config_file = "${params.work_dir}/ensembl-production/src/python/ensembl/xrefs/config/xref_all_sources.json" +params.sources_config_file = "${params.work_dir}/ensembl-production/src/python/ensembl/xrefs/config/xref_config.ini" +params.source_db_url = '' +params.skip_download = 0 +params.reuse_db = 0 +params.skip_preparse = 1 +params.split_files_by_species = 1 +params.tax_ids_file = '' +params.update_mode = 0 + +params.base_path = '' +params.clean_files = 1 +params.clean_dir = "${params.base_path}/clean_files" + +trace { + enabled = true + file = "trace" + overwrite = true +} + +report { + overwrite = true + file = "report.html" + enabled = true +} + +profiles { + + lsf { + process { + errorStrategy = { task.attempt <= process.maxRetries ? 'retry' : 'finish' } + executor = 'lsf' + queue = 'production' + queueSize = 100 + maxRetries = 3 + withLabel:small_process { + memory = 200.MB + //very specific to lsf + executor.perTaskReserve = 200.MB + } + withLabel: dm { + queue = 'datamover' + time = '2h' + } + } + } + + slurm { + process { + errorStrategy = { task.attempt <= process.maxRetries ? 
'retry' : 'finish' } + executor = 'slurm' + queue = 'production' + queueSize = 100 + maxRetries = 3 + time = '1d' + + withLabel:small_process { + memory = 200.MB + } + + withLabel: dm { + queue = 'datamover' + time = '3h' + memory = 2.GB + } + withLabel:mem4GB { + time = '5d' + memory = 4.GB + } + } + } +} + + diff --git a/nextflow/workflows/xrefDownload.nf b/nextflow/workflows/xrefDownload.nf new file mode 100644 index 000000000..65e255fda --- /dev/null +++ b/nextflow/workflows/xrefDownload.nf @@ -0,0 +1,243 @@ +#!/usr/bin/env nextflow + +// Parameter default values +params.pipeline_name = 'Xref Download Pipeline' +params.help = false + +println """\ + XREF DOWNLOAD PIPELINE + ====================== + source_db_url : ${params.source_db_url} + base_path : ${params.base_path} + reuse_db : ${params.reuse_db} + skip_download : ${params.skip_download} + skip_preparse : ${params.skip_preparse} + clean_files : ${params.clean_files} + split_files_by_species : ${params.split_files_by_species} + config_file : ${params.config_file} + sources_config_file : ${params.sources_config_file} + clean_dir : ${params.clean_dir} + tax_ids_file : ${params.tax_ids_file} + update_mode : ${params.update_mode} + """ + .stripIndent() + +def helpMessage() { + log.info""" + Usage: + nextflow run ensembl-production/xrefDownload.nf + --source_db_url (mandatory) Database URL to store information about xref sources. + Syntax: 'mysql://user:password@host:port/dbname' + + --base_path (mandatory) Path where log and source files will be stored, + a scratch space with sufficient storage is recommended. + + --reuse_db (optional) If set to 1, an existing source database (specified in --source_db_url) will be reused. + Default: 0 + + --skip_download (optional) If set to 1, source files will only be downloaded if they don't already exist in --base_path. + Default: 0 + + --skip_preparse (optional) If set to 1, the pre-parse step will be skipped (no central DB). + Default: 1 + + --clean_files (optional) If set to 1, the Cleanup analysis will be run for RefSeq and UniProt files. + Default: 1 + + --split_files_by_species (optional) If set to 1, UniProt and RefSeq files will be split according to taxonomy ID. + Default: 1 + + --config_file (optional) Path to the json file containing information about xref sources to download. + Default: $BASE_DIR/ensembl_nf/src/python/ensembl/xrefs/config/xref_all_sources.json + + --sources_config_file (optional) Path to the ini file containing information about all xref sources and species/divisions. + Default: $BASE_DIR/ensembl_nf/src/python/ensembl/xrefs/config/xref_config.ini + + --clean_dir (optional) Path where the cleaned-up files will be saved. + Default: [--base_path]/clean_files + + --tax_ids_file (optional) Path to the file containing the taxonomy IDs of the species to extract data for. + Used to update the data for the provided species. + + --update_mode (optional) If set to 1, the pipeline is in update mode, refreshing/updating its data for new taxonomy IDs. + Only used if --tax_ids_file is set. 
Default: 0 + """.stripIndent() +} + +workflow { + if (params.help || !params.source_db_url || !params.base_path) { + helpMessage() + + if (!params.source_db_url) { + println """ + Missing required param source_db_url + """.stripIndent() + } + if (!params.base_path) { + println """ + Missing required param base_path + """.stripIndent() + } + + exit 1 + } + + ScheduleDownload() + timestamp = ScheduleDownload.out[0] + + DownloadSource(ScheduleDownload.out[1].splitText(), timestamp) + + CleanupTmpFiles(DownloadSource.out.collect()) + ScheduleCleanup(CleanupTmpFiles.out, timestamp) + + Checksum(ScheduleCleanup.out[0], timestamp) + if (params.split_files_by_species) { + CleanupSplitSource(ScheduleCleanup.out[1].ifEmpty([]).splitText(), timestamp) + NotifyByEmail(Checksum.out.concat(CleanupSplitSource.out.collect()).collect(), timestamp) + } else { + CleanupSource(ScheduleCleanup.out[1].ifEmpty([]).splitText(), timestamp) + NotifyByEmail(Checksum.out.concat(CleanupSource.out.collect()).collect(), timestamp) + } +} + +process ScheduleDownload { + label 'small_process' + + output: + val timestamp + path 'dataflow_sources.json' + + script: + timestamp = new java.util.Date().format("yyyyMMdd_HHmmss") + + """ + python ${params.scripts_dir}/run_module.py --module ensembl.production.xrefs.ScheduleDownload --config_file ${params.config_file} --source_db_url ${params.source_db_url} --reuse_db ${params.reuse_db} --skip_preparse ${params.skip_preparse} --base_path ${params.base_path} --log_timestamp $timestamp + """ +} + +process DownloadSource { + label 'dm' + tag "$src_name" + + input: + val x + val timestamp + + output: + val 'DownloadSourceDone' + + shell: + src_name = (x =~ /"name":\s*"([A-Za-z0-9_.-\/]+)"/)[0][1] + + """ + python ${params.scripts_dir}/run_module.py --module ensembl.production.xrefs.DownloadSource --dataflow '$x' --base_path ${params.base_path} --log_timestamp $timestamp --source_db_url ${params.source_db_url} --skip_download ${params.skip_download} + """ +} + +process CleanupTmpFiles { + label 'small_process' + + input: + val x + + output: + val 'TmpCleanupDone' + + """ + find ${params.base_path} -type f -name "*.tmp" -delete + """ +} + +process ScheduleCleanup { + label 'small_process' + + input: + val x + val timestamp + + output: + val 'ScheduleCleanupDone' + path 'dataflow_cleanup_sources.json' + + """ + python ${params.scripts_dir}/run_module.py --module ensembl.production.xrefs.ScheduleCleanup --base_path ${params.base_path} --source_db_url ${params.source_db_url} --clean_files ${params.clean_files} --clean_dir ${params.clean_dir} --split_files_by_species ${params.split_files_by_species} --log_timestamp $timestamp + """ +} + +process Checksum { + label 'default_process' + + input: + val x + val timestamp + + output: + val 'ChecksumDone' + + """ + python ${params.scripts_dir}/run_module.py --module ensembl.production.xrefs.Checksum --base_path ${params.base_path} --source_db_url ${params.source_db_url} --skip_download ${params.skip_download} --log_timestamp $timestamp + """ +} + +process CleanupSplitSource { + label 'mem4GB' + tag "$src_name" + + input: + each x + val timestamp + + output: + val 'CleanupDone' + + shell: + cmd_params = "" + src_name = (x =~ /"name":\s*"([A-Za-z0-9_.-\/]+)"/)[0][1] + if (x =~ /"version_file":/) { + version_file = (x =~ /"version_file":\s*"(.*?)"/)[0][1] + cmd_params = "${cmd_params} --version_file '${version_file}'" + } + if (params.tax_ids_file) { + cmd_params = "${cmd_params} --tax_ids_file ${params.tax_ids_file}" + } + + """ + perl 
${params.perl_scripts_dir}/cleanup_and_split_source.pl --base_path ${params.base_path} --log_timestamp $timestamp --source_db_url ${params.source_db_url} --name $src_name --clean_dir ${params.clean_dir} --clean_files ${params.clean_files} --update_mode ${params.update_mode} $cmd_params + """ +} + +process CleanupSource { + label 'mem4GB' + tag "$src_name" + + input: + val x + val timestamp + + output: + val 'CleanupDone' + + shell: + cmd_params = "" + src_name = (x =~ /"name":\s*"([A-Za-z0-9_.-\/]+)"/)[0][1] + if (x =~ /"version_file":/) { + version_file = (x =~ /"version_file":\s*"(.*?)"/)[0][1] + cmd_params = "${cmd_params} --version_file '${version_file}'" + } + + """ + perl ${params.perl_scripts_dir}/cleanup_source.pl --base_path ${params.base_path} --log_timestamp $timestamp --source_db_url ${params.source_db_url} --name $src_name --clean_dir ${params.clean_dir} --skip_download ${params.skip_download} --clean_files ${params.clean_files} $cmd_params + """ +} + +process NotifyByEmail { + label 'small_process' + + input: + val x + val timestamp + + """ + python ${params.scripts_dir}/run_module.py --module ensembl.production.xrefs.EmailNotification --pipeline_name '${params.pipeline_name}' --base_path ${params.base_path} --email ${params.email} --email_server ${params.email_server} --log_timestamp $timestamp + """ +} diff --git a/requirements.in b/requirements.in index 0a96ae592..ff57a0d74 100755 --- a/requirements.in +++ b/requirements.in @@ -1,5 +1,5 @@ requests>=2.25.0,<3 pydantic~=1.10.5 -graphene~=2.1.9 -graphene-sqlalchemy~=2.3.0 -git+https://github.com/Ensembl/ensembl-metadata-api.git@2.1.0a1#egg=ensembl-metadata-api +ensembl-py>=2.1.0 +ensembl-utils>=0.4.4 +git+https://github.com/Ensembl/ensembl-metadata-api.git@3.3.0a1#egg=ensembl-metadata-api \ No newline at end of file diff --git a/requirements.txt b/requirements.txt index ba89f7a44..54aeade70 100755 --- a/requirements.txt +++ b/requirements.txt @@ -1,144 +1,55 @@ # -# This file is autogenerated by pip-compile with Python 3.8 +# This file is autogenerated by pip-compile with Python 3.10 # by the following command: # -# pip-compile requirements.in +# pip-compile --output-file=requirements.txt requirements.in # -aniso8601==7.0.0 - # via graphene certifi==2024.2.2 - # via - # ensembl-metadata-api - # requests + # via requests charset-normalizer==3.3.2 - # via - # ensembl-metadata-api - # requests -ensembl-hive @ git+https://github.com/Ensembl/ensembl-hive.git - # via - # ensembl-metadata-api - # ensembl-py -ensembl-metadata-api @ git+https://github.com/Ensembl/ensembl-metadata-api.git@2.1.0a1 + # via requests +ensembl-metadata-api @ git+https://github.com/Ensembl/ensembl-metadata-api.git@3.3.0a1 # via -r requirements.in -ensembl-py @ git+https://github.com/Ensembl/ensembl-py.git@1.2.2 - # via ensembl-metadata-api -exceptiongroup==1.2.0 - # via - # ensembl-metadata-api - # pytest -graphene==2.1.9 - # via - # -r requirements.in - # graphene-sqlalchemy -graphene-sqlalchemy==2.3.0 +ensembl-py==2.1.3 # via -r requirements.in -graphql-core==2.3.2 - # via - # graphene - # graphql-relay -graphql-relay==2.0.1 - # via graphene -greenlet==3.0.3 - # via - # ensembl-metadata-api - # sqlalchemy -grpcio==1.62.0 - # via - # ensembl-metadata-api - # grpcio-reflection - # grpcio-tools -grpcio-reflection==1.62.0 - # via ensembl-metadata-api -grpcio-tools==1.62.0 - # via ensembl-metadata-api +ensembl-utils==0.5.0 + # via -r requirements.in +exceptiongroup==1.2.2 + # via pytest +greenlet==3.1.1 + # via sqlalchemy idna==3.6 - # via - # 
ensembl-metadata-api - # requests + # via requests iniconfig==2.0.0 - # via - # ensembl-metadata-api - # pytest -mysqlclient==2.1.1 - # via - # ensembl-metadata-api - # ensembl-py -packaging==23.2 - # via - # ensembl-metadata-api - # pytest -pluggy==1.4.0 - # via - # ensembl-metadata-api - # pytest -promise==2.3 - # via - # graphene-sqlalchemy - # graphql-core - # graphql-relay -protobuf==4.25.3 - # via - # ensembl-metadata-api - # grpcio-reflection - # grpcio-tools -pydantic==1.10.9 + # via pytest +packaging==24.1 + # via pytest +pluggy==1.5.0 + # via pytest +pydantic==1.10.14 # via -r requirements.in -pytest==8.0.2 - # via - # ensembl-metadata-api - # ensembl-py - # pytest-dependency -pytest-dependency==0.5.1 - # via - # ensembl-metadata-api - # ensembl-py -python-dotenv==0.19.2 - # via - # ensembl-metadata-api - # ensembl-py -pyyaml==6.0.1 - # via - # ensembl-metadata-api - # ensembl-py +pytest==8.3.3 + # via ensembl-utils +python-dotenv==1.0.1 + # via ensembl-utils +pyyaml==6.0.2 + # via ensembl-utils requests==2.31.0 # via # -r requirements.in - # ensembl-metadata-api - # ensembl-py -rx==1.6.3 - # via graphql-core -singledispatch==3.7.0 - # via graphene-sqlalchemy -six==1.16.0 - # via - # graphene - # graphene-sqlalchemy - # graphql-core - # graphql-relay - # promise - # singledispatch -sqlalchemy==1.4.52 + # ensembl-utils +sqlalchemy==2.0.35 # via - # ensembl-metadata-api # ensembl-py - # graphene-sqlalchemy + # ensembl-utils # sqlalchemy-utils -sqlalchemy-utils==0.38.3 - # via - # ensembl-metadata-api - # ensembl-py +sqlalchemy-utils==0.41.2 + # via ensembl-utils tomli==2.0.1 + # via pytest +typing-extensions==4.10.0 # via - # ensembl-metadata-api - # pytest -types-pymysql==1.1.0.1 - # via ensembl-metadata-api -typing-extensions==4.6.3 - # via pydantic + # pydantic + # sqlalchemy urllib3==1.26.18 - # via - # ensembl-metadata-api - # requests - -# The following packages are considered to be unsafe in a requirements file: -# setuptools + # via requests diff --git a/scripts/copyrights/update_copyrights.sh b/scripts/copyrights/update_copyrights.sh index bf18476e1..490acc32b 100755 --- a/scripts/copyrights/update_copyrights.sh +++ b/scripts/copyrights/update_copyrights.sh @@ -50,6 +50,7 @@ for repo in $repositories; do git clone --depth 1 --branch main git@github.com:${repo} ${tmp_dir}/${repo} if [ $? 
-eq 0 ]; then cd ${tmp_dir}/${repo} + git push origin --delete bau/copyright-${year} git checkout -b bau/copyright-${year} perl ${ENSEMBL_ROOT_DIR}/ensembl/misc-scripts/annual_copyright_updater.sh git commit -a -m "${year} copyright update" @@ -65,6 +66,7 @@ for repo in $repositories; do fi else echo 'failed to push commits and open a pull request.'; + git push origin --delete bau/copyright-${year} fi else echo 'failed to commit updates.'; diff --git a/scripts/py/regulation_ftp_symlinks.py b/scripts/py/regulation_ftp_symlinks.py index be8c30d9e..3ded73f0c 100644 --- a/scripts/py/regulation_ftp_symlinks.py +++ b/scripts/py/regulation_ftp_symlinks.py @@ -35,15 +35,14 @@ """ +import logging from argparse import ArgumentParser from collections import defaultdict -import logging -from os import walk, path, listdir, makedirs +from os import listdir, makedirs, path, walk from pathlib import Path - # Human and Mouse follow a different dir structure -SPECIES_TO_NOT_INCLUDE = ["homo_sapiens", "mus_musculus"] +SPECIES_TO_NOT_INCLUDE = [] # GENE-SWITCH species GENE_SWITCH_SPECIES = [ @@ -140,8 +139,15 @@ def get_species_with_analysis_type_folder(analysis_type, ftp_path): def get_most_recent_release_data_file_path(data_file_path): validator.is_dir(Path(data_file_path)) available_releases = listdir(data_file_path) + releases = [] + for release in available_releases: + try: + releases.append(int(release)) + except ValueError: + continue + return Path(data_file_path) / str( - max([int(release) for release in available_releases]) + max(releases) ) @@ -181,12 +187,12 @@ def __init__(self, **path_specifics): def get(self, key): return self.path_specifics.get(key) - def symlink2rf(self, only_remove=False, relative=True): + def symlink2rf(self, analysis_type, only_remove=False, relative=True): target = ( Path(path.relpath(self.target, self.sources["release_folder"])) - / "peaks" + / analysis_type if relative - else self.target / "peaks" + else self.target / analysis_type ) source = self.sources["release_folder"] / self.get("analysis_type") @@ -221,7 +227,9 @@ def _symlink(self, source, target, only_remove): ) else: if not validator.is_symlink(source, check=True): - logger.info("{source} -> {target} -- was successfully removed") + logger.info( + f"{source} -> {target} -- was successfully removed" + ) def aliased_paths(self, **kwargs): return { @@ -243,7 +251,7 @@ def search(analysis_type, ftp_path, release): release=release, ) for species, assemblies in result.items() - for assembly in assemblies + for assembly in assemblies if assembly not in ["GRCh37", "GRCm38", "NCBIM37"] ] @@ -299,7 +307,7 @@ def parse_arguments(): ANALYSIS_TYPE_PEAKS, ftp_path, args.release_version ) for peak in peaks: - peak.symlink2rf(only_remove=args.delete_symlinks) + peak.symlink2rf("peaks", only_remove=args.delete_symlinks) peak.symlink2misc("peaks", only_remove=args.delete_symlinks) logger.info("Searching for signals in data_files ...") @@ -307,8 +315,7 @@ def parse_arguments(): ANALYSIS_TYPE_SIGNAL, ftp_path, args.release_version ) for signal in signals: - signal.symlink2rf(only_remove=args.release_version) - + signal.symlink2rf("signal", only_remove=args.delete_symlinks) signal.symlink2misc("signal", only_remove=args.delete_symlinks) logger.info("Process Completed") diff --git a/scripts/xrefs/cleanup_and_split_source.pl b/scripts/xrefs/cleanup_and_split_source.pl new file mode 100644 index 000000000..3beabbcd6 --- /dev/null +++ b/scripts/xrefs/cleanup_and_split_source.pl @@ -0,0 +1,291 @@ +#!/usr/bin/env perl +# Copyright [1999-2015]
Wellcome Trust Sanger Institute and the EMBL-European Bioinformatics Institute +# Copyright [2016-2024] EMBL-European Bioinformatics Institute +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +use strict; +use warnings; +use Data::Dumper; +use Getopt::Long; +use Carp; +use DBI; +use File::Path qw/make_path rmtree/; +use File::Spec::Functions; +use HTTP::Tiny; +use JSON; +use File::Basename; +use POSIX qw(strftime); + +use Nextflow::Utils; + +my ($base_path, $source_db_url, $source_name, $clean_dir, $clean_files, $version_file, $tax_ids_file, $update_mode, $log_timestamp); +GetOptions( + 'base_path=s' => \$base_path, + 'source_db_url=s' => \$source_db_url, + 'name=s' => \$source_name, + 'clean_dir=s' => \$clean_dir, + 'clean_files=i' => \$clean_files, + 'version_file:s' => \$version_file, + 'tax_ids_file:s' => \$tax_ids_file, + 'update_mode:i' => \$update_mode, + 'log_timestamp:s' => \$log_timestamp +); + +# Check that all mandatory parameters are passed +if (!defined($base_path) || !defined($source_db_url) || !defined($source_name) || !defined($clean_dir) || !defined($clean_files)) { + croak "Usage: cleanup_and_split_source.pl --base_path --source_db_url --name --clean_dir --clean_files [--version_file ] [--tax_ids_file ] [--update_mode ] [--log_timestamp ]"; +} + +if (!defined($update_mode)) {$update_mode = 0;} + +my $log_file; +if (defined($log_timestamp)) { + my $log_path = catdir($base_path, 'logs', $log_timestamp); + make_path($log_path); + $log_file = catfile($log_path, "tmp_logfile_CleanupSplitSource_".int(rand(500))); + + add_to_log_file($log_file, "CleanupSplitSource starting for source $source_name"); + add_to_log_file($log_file, "Param: tax_ids_file = " . ($tax_ids_file // '')); +} + +# Do nothing if not a uniprot or refseq source +if ($source_name !~ /^Uniprot/ && $source_name !~ /^RefSeq_/) { + add_to_log_file($log_file, "Provided source name is invalid. Can only clean up and split Uniprot or RefSeq files."); + exit; +} + +# Remove last '/' character if it exists +if ($base_path =~ /\/$/) {chop($base_path);} + +# Remove / char from source name to access directory +my $clean_name = $source_name; +$clean_name =~ s/\///g; + +my $output_path = $clean_dir."/".$clean_name; + +# Create needed directories +if (!$update_mode) { + rmtree($output_path); +} +make_path($output_path); + +my $sources_to_remove; +my ($is_uniprot, $is_refseq_dna, $is_refseq_peptide) = (0, 0, 0); + +# Decide which files are being processed +my $output_file_name = ''; +if ($source_name =~ /^Uniprot/) { + $is_uniprot = 1;
+ $output_file_name = ($source_name =~ /SPTREMBL/ ? 'uniprot_trembl' : 'uniprot_sprot'); + + # Set sources to skip in parsing step + my @source_names = ( + 'GO', 'UniGene', 'RGD', 'CCDS', 'IPI', 'UCSC', 'SGD', 'HGNC', 'MGI', 'VGNC', 'Orphanet', + 'ArrayExpress', 'GenomeRNAi', 'EPD', 'Xenbase', 'Reactome', 'MIM_GENE', 'MIM_MORBID', 'MIM', + 'Interpro' + ); + $sources_to_remove = join("|", @source_names); +} elsif ($source_name =~ /^RefSeq_dna/) { + $is_refseq_dna = 1; + $output_file_name = 'refseq_rna'; +} elsif ($source_name =~ /^RefSeq_peptide/) { + $is_refseq_peptide = 1; + $output_file_name = 'refseq_protein'; +} else { + croak "Unknown file type $source_name"; +} + +# Extract taxonomy IDs +my %tax_ids; +my ($skipped_species, $added_species) = (0, 0); +if ($tax_ids_file && $update_mode) { + open my $fh, '<', $tax_ids_file; + chomp(my @lines = <$fh>); + close $fh; + %tax_ids = map { $_ => 1 } @lines; + + # Check if any taxonomy IDs already have files + foreach my $tax_id (keys(%tax_ids)) { + my @tax_files = glob($output_path . "/**/**/**/**/" . $output_file_name . "-" . $tax_id); + if (scalar(@tax_files) > 0) { + $tax_ids{$tax_id} = 0; + $skipped_species++; + } + } + + # Do nothing if all taxonomy IDs already have files + if ($skipped_species == scalar(keys(%tax_ids))) { + add_to_log_file($log_file, "All provided tax IDs already have files. Doing nothing."); + exit; + } +} + +# Get all files for source +my $files_path = $base_path."/".$clean_name; +my @files = glob($files_path."/*"); +my $out_fh; +my $current_species_id; + +# Process each file +foreach my $input_file_name (@files) { + local $/ = "//\n"; + + add_to_log_file($log_file, "Splitting up file $input_file_name"); + + $input_file_name = basename($input_file_name); + my $input_file = $files_path."/".$input_file_name; + my $in_fh; + + # Skip the release file + if (defined($version_file) && $input_file eq $version_file) {next;} + + # Open file normally or with zcat for zipped files + if ($input_file_name =~ /\.(gz|Z)$/x) { + open($in_fh, "zcat $input_file |") or die "Couldn't call 'zcat' to open input file '$input_file' $!"; + } else { + open($in_fh, '<', $input_file) or die "Couldn't open file input '$input_file' $!"; + } + + # Only start processing if we could get a filehandle + if (defined($in_fh)) { + my ($write_path, $write_file); + + # Read full records + while (my $record = $in_fh->getline()) { + # Extract the species id from record + my $species_id; + if ($is_uniprot) { + ($species_id) = $record =~ /OX\s+[a-zA-Z_]+=([0-9 ,]+).*;/; + $species_id =~ s/\s+//g if $species_id; + } else { + ($species_id) = $record =~ /db_xref=.taxon:(\d+)/; + } + + # Only continue with wanted species + next if (!$species_id); + next if ($tax_ids_file && (!defined($tax_ids{$species_id}) || !$tax_ids{$species_id})); + + # Clean up data + if ($clean_files) { + if ($is_uniprot) { + $record =~ s/\nR(N|P|X|A|T|R|L|C|G)\s{3}.*//g; # Remove references lines + $record =~ s/\nCC(\s{3}.*)CAUTION: The sequence shown here is derived from an Ensembl(.*)/\nCT$1CAUTION: The sequence shown here is derived from an Ensembl$2/g; # Set specific caution comment to temporary + $record =~ s/\nCC\s{3}.*//g; # Remove comments + $record =~ s/\nCT(\s{3}.*)CAUTION: The sequence shown here is derived from an Ensembl(.*)/\nCC$1CAUTION: The sequence shown here is derived from an Ensembl$2/g; # Set temp line back to comment + $record =~ s/\nFT\s{3}.*//g; # Remove feature coordinates + $record =~ s/\nDR\s{3}($sources_to_remove);.*//g; # Remove sources skipped at processing + } else {
my $skip_data = 0; + my @lines = split("\n", $record); + my @new_record; + + for my $line (@lines) { + if ($is_refseq_dna) { + if ($line =~ /^REFERENCE/ || $line =~ /^COMMENT/ || $line =~ /^\s{5}exon/ || $line =~ /^\s{5}misc_feature/ || $line =~ /^\s{5}variation/) { + $skip_data = 1; + } elsif ($line =~ /^\s{5}source/ || $line =~ /^ORIGIN/) { + $skip_data = 0; + } + } elsif ($is_refseq_peptide) { + if ($line =~ /^REFERENCE/ || $line =~ /^COMMENT/ || $line =~ /^\s{5}Protein/) { + $skip_data = 1; + } elsif ($line =~ /^\s{5}source/ || $line =~ /^\s{5}CDS/ || $line =~ /^ORIGIN/) { + $skip_data = 0; + } + } + + if (!$skip_data) { + push(@new_record, $line); + } + } + + $record = join("\n", @new_record); + } + } + + # Write the record in the appropriate file + if (!defined($current_species_id) || $species_id ne $current_species_id) { + close($out_fh) if (defined($current_species_id)); + + my $species_id_str = sprintf("%04d", $species_id); + my @digits = split('', $species_id_str); + + $write_path = catdir($output_path, $digits[0], $digits[1], $digits[2], $digits[3]); + make_path($write_path); + + $write_file = $write_path."/".$output_file_name."-".$species_id; + + # Check if creating new file + if (!-e $write_file) { + $added_species++; + } + + open($out_fh, '>>', $write_file) or die "Couldn't open output file '$write_file' $!"; + + $current_species_id = $species_id; + } + + print $out_fh $record.($is_uniprot ? "" : "\n"); + } + + close($in_fh); + close($out_fh) if $out_fh; + } +} + +add_to_log_file($log_file, "Source $source_name cleaned up"); +add_to_log_file($log_file, "$source_name skipped species = $skipped_species"); +add_to_log_file($log_file, "$source_name species files created = $added_species"); + +# Save the clean files directory in source db +my ($user, $pass, $host, $port, $source_db) = parse_url($source_db_url); +my $dbi = get_dbi($host, $port, $user, $pass, $source_db); +my $update_version_sth = $dbi->prepare("UPDATE IGNORE version set clean_uri=? where source_id=(SELECT source_id FROM source WHERE name=?)"); +$update_version_sth->execute($output_path, $source_name); +$update_version_sth->finish(); + +sub get_dbi { + my ($host, $port, $user, $pass, $dbname) = @_; + my $dbconn; + if (defined $dbname) { + $dbconn = sprintf("dbi:mysql:host=%s;port=%s;database=%s", $host, $port, $dbname); + } else { + $dbconn = sprintf("dbi:mysql:host=%s;port=%s", $host, $port); + } + my $dbi = DBI->connect( $dbconn, $user, $pass, { 'RaiseError' => 1 } ) or croak( "Can't connect to database: " . $DBI::errstr );
+ return $dbi; +} + +sub parse_url { + my ($url) = @_; + my $parsed_url = Nextflow::Utils::parse($url); + my $user = $parsed_url->{'user'}; + my $pass = $parsed_url->{'pass'}; + my $host = $parsed_url->{'host'}; + my $port = $parsed_url->{'port'}; + my $db = $parsed_url->{'dbname'}; + return ($user, $pass, $host, $port, $db); +} + +sub add_to_log_file { + my ($log_file, $message) = @_; + + if (defined($log_file)) { + my $current_timestamp = strftime "%d-%b-%Y %H:%M:%S", localtime; + + open(my $fh, '>>', $log_file); + print $fh "$current_timestamp | INFO | $message\n"; + close($fh); + } +} diff --git a/scripts/xrefs/cleanup_source.pl b/scripts/xrefs/cleanup_source.pl new file mode 100644 index 000000000..5ce29a0f5 --- /dev/null +++ b/scripts/xrefs/cleanup_source.pl @@ -0,0 +1,235 @@ +#!/usr/bin/env perl +# Copyright [1999-2015] Wellcome Trust Sanger Institute and the EMBL-European Bioinformatics Institute +# Copyright [2016-2024] EMBL-European Bioinformatics Institute +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +use strict; +use warnings; +use Data::Dumper; +use Getopt::Long; +use Carp; +use DBI; +use File::Path qw/make_path/; +use File::Spec::Functions; +use POSIX qw(strftime); + +use Nextflow::Utils; + +my ($base_path, $source_db_url, $source_name, $clean_dir, $skip_download, $clean_files, $version_file, $log_timestamp); +GetOptions( + 'base_path=s' => \$base_path, + 'source_db_url=s' => \$source_db_url, + 'name=s' => \$source_name, + 'clean_dir=s' => \$clean_dir, + 'skip_download=i' => \$skip_download, + 'clean_files=i' => \$clean_files, + 'version_file:s' => \$version_file, + 'log_timestamp:s' => \$log_timestamp +); + +# Check that all mandatory parameters are passed +if (!defined($base_path) || !defined($source_db_url) || !defined($source_name) || !defined($clean_dir) || !defined($skip_download) || !defined($clean_files)) { + croak "Usage: cleanup_source.pl --base_path --source_db_url --name --clean_dir --skip_download --clean_files [--version_file ] [--log_timestamp ]"; +} + +my $log_file; +if (defined($log_timestamp)) { + my $log_path = catdir($base_path, 'logs', $log_timestamp); + make_path($log_path); + $log_file = catfile($log_path, "tmp_logfile_CleanupSource_".int(rand(500))); + + add_to_log_file($log_file, "CleanupSource starting for source $source_name"); +} + +# Do nothing if not cleaning files, not a uniprot or refseq source, or no new download +if ($clean_files && ($source_name =~ /^Uniprot/ || $source_name =~ /^RefSeq_/)) { + # Remove last '/' character if it exists + if ($base_path =~ /\/$/) {chop($base_path);} + + # Remove / char from source name to access directory + my $clean_name = $source_name; + $clean_name =~ s/\///g; + + my $output_path = $clean_dir."/".$clean_name; + my $update_clean_uri = 0; + + # If not a new download, check if clean files exist + if ($skip_download) { + if (-d $output_path) { + $update_clean_uri = 1; + } + } else { + # Create needed directories + make_path($output_path); + + $update_clean_uri = 1; + + my $sources_to_remove;
+ my ($is_uniprot, $is_refseq_dna, $is_refseq_peptide) = (0, 0, 0); + my $file_size = 0; + + # Set sources to skip in parsing step (uniprot only) + if ($source_name =~ /^Uniprot/) { + $is_uniprot = 1; + my @source_names = ( + 'GO', 'UniGene', 'RGD', 'CCDS', 'IPI', 'UCSC', 'SGD', 'HGNC', 'MGI', 'VGNC', 'Orphanet', + 'ArrayExpress', 'GenomeRNAi', 'EPD', 'Xenbase', 'Reactome', 'MIM_GENE', 'MIM_MORBID', 'MIM', + 'Interpro' + ); + $sources_to_remove = join("|", @source_names); + $file_size = 200000; + } elsif ($source_name =~ /^RefSeq_dna/) { + $is_refseq_dna = 1; + } elsif ($source_name =~ /^RefSeq_peptide/) { + $is_refseq_peptide = 1; + } else { + croak "Unknown file type $source_name"; + } + + # Get all files for source + my $files_path = $base_path."/".$clean_name; + my @files = `ls $files_path`; + foreach my $file_name (@files) { + $file_name =~ s/\n//; + my $file = $files_path."/".$file_name; + + # Skip the release file + if (defined($version_file) && $file eq $version_file) {next;} + + my ($in_fh, $out_fh); + my $output_file = $file_name; + + # Open file normally or with zcat for zipped files + if ($file_name =~ /\.(gz|Z)$/x) { + open($in_fh, "zcat $file |") + or die "Couldn't call 'zcat' to open input file '$file' $!"; + + $output_file =~ s/\.[^.]+$//; + } else { + open($in_fh, '<', $file) + or die "Couldn't open file input '$file' $!"; + } + + # Only start cleaning up if we could get a filehandle + my $count = 0; + my $file_count = 1; + if (defined($in_fh)) { + if ($is_uniprot) { + local $/ = "//\n"; + + my $write_file = $output_path."/".$output_file . "-$file_count"; + open($out_fh, '>', $write_file) or die "Couldn't open output file '$write_file' $!"; + + # Read full records + while ($_ = $in_fh->getline()) { + # Remove unused data + $_ =~ s/\nR(N|P|X|A|T|R|L|C|G)\s{3}.*//g; # Remove references lines + $_ =~ s/\nCC(\s{3}.*)CAUTION: The sequence shown here is derived from an Ensembl(.*)/\nCT$1CAUTION: The sequence shown here is derived from an Ensembl$2/g; # Set specific caution comment to temporary + $_ =~ s/\nCC\s{3}.*//g; # Remove comments + $_ =~ s/\nCT(\s{3}.*)CAUTION: The sequence shown here is derived from an Ensembl(.*)/\nCC$1CAUTION: The sequence shown here is derived from an Ensembl$2/g; # Set temp line back to comment + $_ =~ s/\nFT\s{3}.*//g; # Remove feature coordinates + $_ =~ s/\nDR\s{3}($sources_to_remove);.*//g; # Remove sources skipped at processing + + # Add the lines we need to the output + print $out_fh $_; + + # Check how many lines have been processed and write to new file if size exceeded + $count++; + if ($count > $file_size) { + close($out_fh); + $file_count++;
"-$file_count"; + open($out_fh, '>', $write_file) + or die "Couldn't open output file '$write_file' $!"; + $count = 0; + } + } + + close($in_fh); + close($out_fh); + } else { + $output_file = $output_path."/".$output_file; + open($out_fh, '>', $output_file) or die "Couldn't open output file '$output_file' $!"; + + # Remove unuused data + my $skip_data = 0; + while (<$in_fh>) { + if ($is_refseq_dna) { + if ($_ =~ /^REFERENCE/ || $_ =~ /^COMMENT/ || $_ =~ /^\s{5}exon/ || $_ =~ /^\s{5}misc_feature/ || $_ =~ /^\s{5}variation/) { + $skip_data = 1; + } elsif ($_ =~ /^\s{5}source/ || $_ =~ /^ORIGIN/) { + $skip_data = 0; + } + } elsif ($is_refseq_peptide) { + if ($_ =~ /^REFERENCE/ || $_ =~ /^COMMENT/ || $_ =~ /^\s{5}Protein/) { + $skip_data = 1; + } elsif ($_ =~ /^\s{5}source/ || $_ =~ /^\s{5}CDS/ || $_ =~ /^ORIGIN/) { + $skip_data = 0; + } + } + + if (!$skip_data) {print $out_fh $_;} + } + + close($in_fh); + close($out_fh); + } + } + } + + add_to_log_file($log_file, "Source $source_name cleaned up"); + } + + # Save the clean files directory in source db + if ($update_clean_uri) { + my ($user, $pass, $host, $port, $source_db) = parse_url($source_db_url); + my $dbi = get_dbi($host, $port, $user, $pass, $source_db); + my $update_version_sth = $dbi->prepare("UPDATE IGNORE version set clean_uri=? where source_id=(SELECT source_id FROM source WHERE name=?)"); + $update_version_sth->execute($output_path, $source_name); + $update_version_sth->finish(); + } +} + +sub get_dbi { + my ($host, $port, $user, $pass, $dbname) = @_; + my $dbconn; + if (defined $dbname) { + $dbconn = sprintf("dbi:mysql:host=%s;port=%s;database=%s", $host, $port, $dbname); + } else { + $dbconn = sprintf("dbi:mysql:host=%s;port=%s", $host, $port); + } + my $dbi = DBI->connect( $dbconn, $user, $pass, { 'RaiseError' => 1 } ) or croak( "Can't connect to database: " . $DBI::errstr ); + return $dbi; +} + +sub parse_url { + my ($url) = @_; + my $parsed_url = Nextflow::Utils::parse($url); + my $user = $parsed_url->{'user'}; + my $pass = $parsed_url->{'pass'}; + my $host = $parsed_url->{'host'}; + my $port = $parsed_url->{'port'}; + my $db = $parsed_url->{'dbname'}; + return ($user, $pass, $host, $port, $db); +} + +sub add_to_log_file { + my ($log_file, $message) = @_; + + if (defined($log_file)) { + my $current_timestamp = strftime "%d-%b-%Y %H:%M:%S", localtime; + + open(my $fh, '>>', $log_file); + print $fh "$current_timestamp | INFO | $message\n"; + close($fh); + } +} diff --git a/sql/patch_109_110_a.sql b/sql/patch_109_110_a.sql index 427981eec..27bc2d892 100644 --- a/sql/patch_109_110_a.sql +++ b/sql/patch_109_110_a.sql @@ -13,7 +13,7 @@ -- See the License for the specific language governing permissions and -- limitations under the License. -# patch_108_109_a.sql +# patch_109_110_a.sql # # Title: Update schema version. # diff --git a/sql/patch_110_111_a.sql b/sql/patch_110_111_a.sql index 91ba1c712..e8725ff82 100644 --- a/sql/patch_110_111_a.sql +++ b/sql/patch_110_111_a.sql @@ -13,7 +13,7 @@ -- See the License for the specific language governing permissions and -- limitations under the License. -# patch_108_109_a.sql +# patch_110_111_a.sql # # Title: Update schema version. 
# diff --git a/sql/patch_111_112_a.sql b/sql/patch_111_112_a.sql new file mode 100644 index 000000000..4394d65ed --- /dev/null +++ b/sql/patch_111_112_a.sql @@ -0,0 +1,27 @@ +-- Copyright [1999-2015] Wellcome Trust Sanger Institute and the EMBL-European Bioinformatics Institute +-- Copyright [2016-2024] EMBL-European Bioinformatics Institute +-- +-- Licensed under the Apache License, Version 2.0 (the "License"); +-- you may not use this file except in compliance with the License. +-- You may obtain a copy of the License at +-- +-- http://www.apache.org/licenses/LICENSE-2.0 +-- +-- Unless required by applicable law or agreed to in writing, software +-- distributed under the License is distributed on an "AS IS" BASIS, +-- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +-- See the License for the specific language governing permissions and +-- limitations under the License. + +# patch_111_112_a.sql +# +# Title: Update schema version. +# +# Description: +# Update schema_version in meta table to 112. + +UPDATE meta SET meta_value='112' WHERE meta_key='schema_version'; + +# Patch identifier +INSERT INTO meta (species_id, meta_key, meta_value) +VALUES (NULL, 'patch', 'patch_111_112_a.sql|schema_version'); diff --git a/sql/patch_112_113_a.sql b/sql/patch_112_113_a.sql new file mode 100644 index 000000000..f89e3f0b9 --- /dev/null +++ b/sql/patch_112_113_a.sql @@ -0,0 +1,27 @@ +-- Copyright [1999-2015] Wellcome Trust Sanger Institute and the EMBL-European Bioinformatics Institute +-- Copyright [2016-2024] EMBL-European Bioinformatics Institute +-- +-- Licensed under the Apache License, Version 2.0 (the "License"); +-- you may not use this file except in compliance with the License. +-- You may obtain a copy of the License at +-- +-- http://www.apache.org/licenses/LICENSE-2.0 +-- +-- Unless required by applicable law or agreed to in writing, software +-- distributed under the License is distributed on an "AS IS" BASIS, +-- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +-- See the License for the specific language governing permissions and +-- limitations under the License. + +# patch_112_113_a.sql +# +# Title: Update schema version. +# +# Description: +# Update schema_version in meta table to 113. + +UPDATE meta SET meta_value='113' WHERE meta_key='schema_version'; + +# Patch identifier +INSERT INTO meta (species_id, meta_key, meta_value) +VALUES (NULL, 'patch', 'patch_112_113_a.sql|schema_version'); diff --git a/sql/patch_113_114_a.sql b/sql/patch_113_114_a.sql new file mode 100644 index 000000000..5637cb307 --- /dev/null +++ b/sql/patch_113_114_a.sql @@ -0,0 +1,27 @@ +-- Copyright [1999-2015] Wellcome Trust Sanger Institute and the EMBL-European Bioinformatics Institute +-- Copyright [2016-2024] EMBL-European Bioinformatics Institute +-- +-- Licensed under the Apache License, Version 2.0 (the "License"); +-- you may not use this file except in compliance with the License. +-- You may obtain a copy of the License at +-- +-- http://www.apache.org/licenses/LICENSE-2.0 +-- +-- Unless required by applicable law or agreed to in writing, software +-- distributed under the License is distributed on an "AS IS" BASIS, +-- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +-- See the License for the specific language governing permissions and +-- limitations under the License. + +# patch_113_114_a.sql +# +# Title: Update schema version. +# +# Description: +# Update schema_version in meta table to 114.
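+# +# Illustrative note (invocation hypothetical): a patch like this is applied directly to the production database with the mysql client, e.g.: +#   mysql -h my_host -u my_user -p ensembl_production < sql/patch_113_114_a.sql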
+ +UPDATE meta SET meta_value='114' WHERE meta_key='schema_version'; + +# Patch identifier +INSERT INTO meta (species_id, meta_key, meta_value) +VALUES (NULL, 'patch', 'patch_113_114_a.sql|schema_version'); diff --git a/sql/table.sql b/sql/table.sql index a72e11ee5..19ce5692b 100644 --- a/sql/table.sql +++ b/sql/table.sql @@ -32,11 +32,11 @@ CREATE TABLE IF NOT EXISTS meta ( # Add schema type and schema version to the meta table INSERT INTO meta (species_id, meta_key, meta_value) VALUES (NULL, 'schema_type', 'production'), - (NULL, 'schema_version', 111); + (NULL, 'schema_version', 114); # Patches included in this schema file INSERT INTO meta (species_id, meta_key, meta_value) - VALUES (NULL, 'patch', 'patch_110_111_a.sql|schema version'); + VALUES (NULL, 'patch', 'patch_113_114_a.sql|schema_version'); -- The 'master_biotype' table. -- Contains all the valid biotypes used for genes and transcripts. diff --git a/src/python/ensembl/common/Params.py b/src/python/ensembl/common/Params.py new file mode 100644 index 000000000..ef9371f99 --- /dev/null +++ b/src/python/ensembl/common/Params.py @@ -0,0 +1,233 @@ +# See the NOTICE file distributed with this work for additional information +# regarding copyright ownership. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Params module to handle parameter manipulation between pipeline processes.""" + +import sys +import re +import json +import argparse + +sys.tracebacklimit = 0 + +class Params: + def __init__(self, params: dict=None, parse_dataflow_json: bool=True) -> None: + """ + Parameters + ---------- + params: dict, optional + The parameters to start the object with. If defined, command-line parameters won't be parsed (default is None) + parse_dataflow_json: bool, optional + Specifies whether to parse an option called 'dataflow' in the provided options (default is True) + """ + if params: + self._params = params + else: + self._params = {} + self.parse_argv_params(parse_dataflow_json) + + def parse_argv_params(self, parse_dataflow_json: bool=True): + """Parses command-line arguments and extracts them into the Params object. + Command-line arguments need to be passed in the format "--name value". + + Parameters + ---------- + parse_dataflow_json: bool, optional + Specifies whether to parse an option called 'dataflow' in the provided options (default is True) + """ + args = sys.argv[1:] + + # Extract param names from command line + r = re.compile(r"^--") + param_names = list(filter(r.match, args)) + + parser = argparse.ArgumentParser() + for name in param_names: + parser.add_argument(name) + + params = parser.parse_args() + for param_name in vars(params): + if param_name == 'dataflow' and parse_dataflow_json: + dataflow_params = json.loads(getattr(params, param_name)) + for name,value in dataflow_params.items(): + self.param(name, value) + else: + self.param(param_name, getattr(params, param_name)) + + def param(self, name: str, new_value=None, options: dict={}): + """ Gets or sets a parameter value.
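+ For example (illustrative): self.param('base_path', '/data') sets the value, self.param('base_path') returns it, and self.param('retries', None, {'default': 3, 'type': 'int'}) applies a default and a type check.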
+ + Parameters + ---------- + name: str + The name of the parameter + new_value: any, optional + The value to set the parameter to (default is None) + options: dict, optional + Extra options, including: + - default: The default value to use if parameter has no value (sets the parameter value to this) + - type: The type of the parameter value, used to check if value is valid + + Returns + ------- + The value of the parameter with provided name. + + Raises + ------ + AttributeError + If no parameter name was passed. + """ + if not name: + raise AttributeError('You must supply a parameter name') + + value = None + + if new_value is not None: + self._params[name] = new_value + value = new_value + else: + value = self._params.get(name) + if value is None and options.get('default') is not None: + default = options['default'] + self._params[name] = default + value = default + + if options.get('type'): + return self.check_type(name, value, options['type']) + + return value + + def param_required(self, name: str, options: dict={}): + """ Gets a parameter value, raising an error if no value is found. + + Parameters + ---------- + name: str + The name of the parameter + options: dict, optional + Extra options, including: + - default: The default value to use if parameter has no value (sets the parameter value to this) + - type: The type of the parameter value, used to check if value is valid + + Returns + ------- + The value of the parameter with provided name. + + Raises + ------ + AttributeError + If no value is found for the required parameter. + """ + value = self.param(name, None, options) + + if value is None: + raise AttributeError(f'Parameter \'{name}\' is required but has no value') + + return value + + def check_type(self, name: str, value, value_type: str): + """ Checks if the parameter value provided is valid. + For specific types, this function can change the parameter value. + + Parameters + ---------- + name: str + The name of the parameter + value: any + The value of the parameter + value_type: str + The type of the parameter value. Accepted types: + - hash, dict, or dictionary + - array or list + - int or integer + - bool or boolean + - str or string + + Returns + ------- + None if no value is found, otherwise the validated (and possibly converted) value of the parameter. + + Raises + ------ + AttributeError + If no parameter name is provided. + If parameter value is not valid. + """ + if not name: + raise AttributeError('You must supply a parameter name') + if value is None: + return + + value_type = value_type.lower() + error = 0 + new_value = value + + if value_type in ['hash', 'dict', 'dictionary'] and not isinstance(value, dict): + error = 1 + elif value_type in ['array', 'list'] and not isinstance(value, list): + # Try to split by commas + if isinstance(value, str) and re.search(",", value): + new_value = value.split(",") + else: + new_value = [value] + elif value_type in ['integer', 'int'] and not isinstance(value, int): + # Try to make it an integer + try: + new_value = int(value) + except ValueError: + error = 1 + elif value_type in ['bool', 'boolean'] and not isinstance(value, bool): + # Try to make it a boolean + if isinstance(value, int): + new_value = bool(value) + elif value in ['0', '1']: + new_value = bool(int(value)) + else: + error = 1 + elif value_type in ['str', 'string'] and not isinstance(value, str): + new_value = str(value) + + if error:
raise AttributeError(f'Parameter \'{name}\' has an invalid value \'{value}\'. Must be of type {value_type}') + + self.param(name, new_value) + return new_value + + def write_output(self, suffix: str, params: dict): + """ Appends data to the dataflow json file (passed into next pipeline process). + + Parameters + ---------- + suffix: str + The file suffix to add to the output file name (dataflow_[suffix].json) + params: dict + The data to append into the file + """ + # Remove null params + params = {k: v for k, v in params.items() if v is not None} + + with open(f'dataflow_{suffix}.json', 'a') as fh: + json.dump(params, fh) + fh.write("\n") + + def write_all_output(self, suffix: str): + """ Appends all of the parameters in the object into the dataflow json file. + This calls the write_output function. + + Parameters + ---------- + suffix: str + The file suffix to add to the output file name (dataflow_[suffix].json) + """ + self.write_output(suffix, self._params) diff --git a/src/python/ensembl/production/hive/ensembl_genome_metadata/MetadataUpdaterHiveCore.py b/src/python/ensembl/production/hive/ensembl_genome_metadata/MetadataUpdaterHiveCore.py index 6a37efffc..e6c2a9a9c 100644 --- a/src/python/ensembl/production/hive/ensembl_genome_metadata/MetadataUpdaterHiveCore.py +++ b/src/python/ensembl/production/hive/ensembl_genome_metadata/MetadataUpdaterHiveCore.py @@ -10,7 +10,7 @@ # See the License for the specific language governing permissions and # limitations under the License. - +import json from ensembl.production.hive.BaseProdRunnable import BaseProdRunnable from ensembl.production.metadata.updater.core import CoreMetaUpdater @@ -18,11 +18,18 @@ class MetadataUpdaterHiveCore(BaseProdRunnable): def run(self): - if self.param("force") == 0 or self.param("force") is None: - run = CoreMetaUpdater(self.param("database_uri"), self.param("genome_metadata_uri"), self.param("taxonomy_uri")) - elif self.param("force") == 1: - run = CoreMetaUpdater(self.param("database_uri"), self.param("genome_metadata_uri"), self.param("taxonomy_uri"), - force=1) - else: - raise ValueError(f"Unable to figure out param {self.param('force')}") - run.process_core() + try: + run = CoreMetaUpdater(self.param("database_uri"), self.param("genome_metadata_uri")) + run.process_core() + output = { 'metadata_uri' : self.param("genome_metadata_uri"), + 'database_uri' : self.param("database_uri"), + 'email': self.param("email") + } + + self.dataflow({ + 'job_id' : self.input_job.dbID, + 'output' : json.dumps(output) + }, 2) + + except Exception as e: + raise ValueError(str(e)) from e diff --git a/src/python/ensembl/production/xrefs/Base.py b/src/python/ensembl/production/xrefs/Base.py new file mode 100644 index 000000000..fcf94dc42 --- /dev/null +++ b/src/python/ensembl/production/xrefs/Base.py @@ -0,0 +1,781 @@ +# See the NOTICE file distributed with this work for additional information +# regarding copyright ownership. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License.
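+# +# Illustrative usage sketch (hypothetical subclass; concrete pipeline modules such as +# ensembl.production.xrefs.ScheduleDownload presumably follow this pattern): +# +#   class MyStep(Base): +#       def run(self): +#           base_path = self.param_required('base_path') +#           self.write_output('sources', {'name': 'RefSeq_dna'}) +# +#   MyStep().run()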
+ +"""Base xref module to include all common functions used by xref modules.""" + +import re +import os +import shutil +import requests +import fnmatch +import gzip +import importlib +import wget +import threading +import json +import logging +import time +import random +import csv +import subprocess + +from sqlalchemy import create_engine, select, insert, update, text, func, and_ +from sqlalchemy.engine.url import make_url, URL +from sqlalchemy.engine import Connection +from sqlalchemy.orm import aliased +from sqlalchemy_utils import database_exists, create_database, drop_database +from urllib.parse import urlparse +from ftplib import FTP +from itertools import groupby +from configparser import ConfigParser +from datetime import datetime + +from ensembl.xrefs.xref_source_db_model import Base as XrefSourceDB, Source as SourceSORM, Version as VersionORM, ChecksumXref as ChecksumXrefSORM + +from ensembl.xrefs.xref_update_db_model import Base as XrefUpdateDB, Source as SourceUORM, SourceURL as SourceURLORM, Xref as XrefUORM, \ + PrimaryXref as PrimaryXrefORM, DependentXref as DependentXrefUORM, GeneDirectXref as GeneDirectXrefORM, TranscriptDirectXref as TranscriptDirectXrefORM, \ + TranslationDirectXref as TranslationDirectXrefORM, Synonym as SynonymORM, Pairs as PairsORM, Species as SpeciesORM, \ + SourceMappingMethod as SourceMappingMethodORM, MappingJobs as MappingJobsORM, Mapping as MappingORM + +from ensembl.core.models import Meta as MetaCORM, Gene as GeneORM, Transcript as TranscriptORM, Analysis as AnalysisORM, \ + ExonTranscript as ExonTranscriptORM, SupportingFeature as SupportingFeatureORM, DnaAlignFeature as DnaAlignFeatureORM, \ + TranscriptAttrib as TranscriptAttribORM, AttribType as AttribTypeORM, AnalysisDescription as AnalysisDescriptionORM, \ + SeqRegion as SeqRegionORM, SeqRegionAttrib as SeqRegionAttribORM, CoordSystem as CoordSystemORM, Translation as TranslationORM, \ + Exon as ExonORM, Xref as XrefCORM, DependentXref as DependentXrefCORM, ExternalDb as ExternalDbORM, Dna as DnaORM, ObjectXref as ObjectXrefCORM + +from ensembl.common.Params import Params + +class Base(Params): + """ Class to represent the base of xref modules. Inherits the Params class. + """ + def __init__(self, params: dict=None, parse_dataflow_json: bool=True) -> None: + """ Calls the parent __init__ then sets some specific parameters. + + Parameters + ---------- + params: dict, optional + The parameters to start the object with. 
If defined, command-line parameters won't be parsed (default is None) + parse_dataflow_json: bool, optional + Specifies whether to parse an option called 'dataflow' in the provided options (default is True) + """ + super().__init__(params, parse_dataflow_json) + + self.param('metasearch_url', "http://registry-grpc.ebi.ac.uk:8080/registry/metaSearch") + + # Initialize the logfile for this run + if self.param('log_timestamp'): + current_timestamp = self.param('log_timestamp') + else: + current_timestamp = datetime.now().strftime('%Y%m%d_%H%M%S') + + log_path = os.path.join(self.param_required('base_path'), 'logs', current_timestamp) + os.makedirs(log_path, exist_ok=True) + + log_file = os.path.join(log_path, 'tmp_logfile_'+self.__class__.__name__+'_'+str(random.randint(0, 5000))) + self._log_file = log_file + + console_handler = logging.StreamHandler() + file_handler = logging.FileHandler(log_file, mode='a') + console_handler.setLevel(logging.WARNING) + file_handler.setLevel(logging.DEBUG) + + logging.basicConfig( + level=logging.DEBUG, + format='%(asctime)s | %(levelname)s | %(message)s', + datefmt='%d-%b-%Y %H:%M:%S', + handlers=[console_handler, file_handler] + ) + + def create_source_db(self, source_url: str, reuse_db_if_present: bool): + """ Creates the xref source database from model. + + Parameters + ---------- + source_url: str + The source database URL with format: [driver]://[user]:[password]@[host]:[port]/[dbname] + reuse_db_if_present: bool + If set to False, the database defined by provided URL will be dropped before creating a new one + """ + url = make_url(source_url) + engine = create_engine(url, isolation_level="AUTOCOMMIT") + + if url.database and reuse_db_if_present: + return + + if database_exists(engine.url): + drop_database(engine.url) + create_database(engine.url) + XrefSourceDB.metadata.create_all(engine) + + def download_file(self, file: str, base_path: str, source_name: str, extra_args: dict): + """ Downloads an xref file and saves into provided space. + + Parameters + ---------- + file: str + The URL of the file to download. Acceptable URL schemes: ftp, http, and https + base_path: str + The path to save the downloaded file into + source_name: str + The xref source name + extra_args: dict + Extra options, including: + - skip_download_if_file_present: If set to True, file is only downloaded if does not exist + - db: The type of external db for the xref source (only relevant here if equal to 'checksum') + - release: If set to 'version', then this is a version file download + - rel_number: The URL used to retrieve the release number (only for RefSeq) + - catalog: The URL used to retrieve the release catalog (only for RefSeq) + + Returns + ------- + The path of the downloaded file. + + Raises + ------ + LookupError + If rel_number is provided but no release number was found in URL. + AttributeError + If file URL scheme is invalid.
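+ + Example + ------- + Illustrative only; the URL and paths are hypothetical: + file_dir = self.download_file('https://example.org/pub/source.txt.gz', '/scratch/xrefs', 'MySource', {'skip_download_if_file_present': True})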
+ """ + # Create uri object and get scheme + uri = urlparse(file) + if not uri.scheme: + return file + + # Get extra parameters + skip_download_if_file_present = extra_args.get('skip_download_if_file_present') or False + db = extra_args.get('db') + release = extra_args.get('release') + rel_number = extra_args.get('rel_number') + catalog = extra_args.get('catalog') + + # Create file download path + orig_source_name = source_name + source_name = re.sub(r"\/", "", source_name) + dest_dir = os.path.join(base_path, source_name) + if db and db == 'checksum': + dest_dir = os.path.join(base_path, 'Checksum') + if not os.path.exists(dest_dir): os.makedirs(dest_dir, exist_ok = True) + + file_path = "" + + # If file is in local ftp, copy from there + if re.search("ftp.ebi.ac.uk", file): + # Construct local path + local_file = file + local_file = re.sub("https://ftp.ebi.ac.uk/pub/", "/nfs/ftp/public/", local_file) + + # Check if local file exists + if os.path.exists(local_file): + file_path = os.path.join(dest_dir, os.path.basename(uri.path)) + if db and db == 'checksum': + file_path = os.path.join(dest_dir, f'{source_name}-{os.path.basename(uri.path)}') + + if not (skip_download_if_file_present and os.path.exists(file_path)): + shutil.copy(local_file, file_path) + + # Check if copy was successful + if os.path.exists(file_path): + logging.info(f'{orig_source_name} file copied from local FTP: {file_path}') + if release: + return file_path + return os.path.dirname(file_path) + else: + logging.info(f'{orig_source_name} file already exists, skipping download ({file_path})') + + # Handle Refseq files + if re.search("RefSeq", source_name) and rel_number and catalog and not release: + # Get current release number + release_number = requests.get(rel_number).json() + if not release_number: + raise LookupError(f'No release number in {rel_number}') + + # Get list of files in release catalog + catalog = re.sub(r"\*", str(release_number), catalog) + files_list = requests.get(catalog).text + refseq_files = files_list.split("\n") + files_to_download = [] + + # Download each refseq file + for refseq_file in refseq_files: + if not refseq_file: continue + checksum, filename = refseq_file.split("\t") + + # Only interested in files matching pattern + if not fnmatch.fnmatch(filename, os.path.basename(uri.path)): continue + if re.search("nonredundant_protein", filename) or re.search("wp_protein", filename): continue + + file_path = os.path.join(dest_dir, os.path.basename(filename)) + if os.path.exists(file_path): + if skip_download_if_file_present: + logging.info(f'{orig_source_name} file already exists, skipping download ({file_path})') + continue + os.remove(file_path) + + file_url = os.path.join(os.path.dirname(file), filename) + files_to_download.append({'url': file_url, 'path': file_path}) + logging.info(f'{orig_source_name} file downloaded via HTTP: {file_path}') + + self.refseq_multithreading(files_to_download) + elif uri.scheme == 'ftp': + ftp = FTP(uri.netloc) + ftp.login('anonymous', '-anonymous@') + ftp.cwd(os.path.dirname(uri.path)) + remote_files = ftp.nlst() + + # Download files in ftp server + for remote_file in remote_files: + # Only interested in files matching pattern + if not fnmatch.fnmatch(remote_file, os.path.basename(uri.path)): continue + + remote_file = re.sub(r"\n", "", remote_file) + file_path = os.path.join(dest_dir, os.path.basename(remote_file)) + if db and db == 'checksum': + file_path = os.path.join(dest_dir, f'{source_name}-{os.path.basename(remote_file)}') + + if not 
+ if not (skip_download_if_file_present and os.path.exists(file_path)): + with open(file_path, 'wb') as out_fh: + ftp.retrbinary("RETR " + remote_file, out_fh.write) + logging.info(f'{orig_source_name} file downloaded via FTP: {file_path}') + else: + logging.info(f'{orig_source_name} file already exists, skipping download ({file_path})') + ftp.close() + elif uri.scheme == 'http' or uri.scheme == 'https': + # This is the case for the release file + if re.search("RefSeq", source_name) and rel_number and release: + # Get current release number + release_number = requests.get(rel_number).json() + if not release_number: + raise LookupError(f'No release number in {rel_number}') + + file = re.sub(r"\*", str(release_number), file) + uri = urlparse(file) + + file_path = os.path.join(dest_dir, os.path.basename(uri.path)) + if db and db == 'checksum': + file_path = os.path.join(dest_dir, f'{source_name}-{os.path.basename(uri.path)}') + + if not os.path.exists(file_path) or not skip_download_if_file_present: + if not skip_download_if_file_present and os.path.exists(file_path): + os.remove(file_path) + wget.download(file, file_path) + logging.info(f'{orig_source_name} file downloaded via HTTP: {file_path}') + else: + logging.info(f'{orig_source_name} file already exists, skipping download ({file_path})') + else: + raise AttributeError(f'Invalid URL scheme {uri.scheme}') + + if release: + return file_path + return os.path.dirname(file_path) + + def refseq_multithreading(self, files): + """ Creates multiple threads to download RefSeq files in parallel. + + Parameters + ---------- + files: list + The list of file URLs and paths to download. + """ + number_of_threads = 20 + chunk_size = int(len(files) / number_of_threads) + threads = [] + + for thread_index in range(number_of_threads): + array_start = thread_index * chunk_size + array_end = len(files) if thread_index+1 == number_of_threads else (thread_index+1) * chunk_size + + thread = threading.Thread(target=self.download_refseq_files, args=(files, array_start, array_end)) + threads.append(thread) + threads[thread_index].start() + + for thread in threads: + thread.join() + + def download_refseq_files(self, files, start: int, end: int): + """ Downloads RefSeq files from a subset of files. + + Parameters + ---------- + files: list + The list of file URLs and paths to download. + start: int + The start index of the files list. + end: int + The end index of the files list. + + Raises + ------ + Exception + If file download fails all attempts. + """ + for index in range(start, end): + failed = 0 + file_url = files[index]['url'] + local_path = files[index]['path'] + + for retry in range(0, 3): + try: + wget.download(file_url, local_path) + failed = 0 + break + except Exception: + failed += 1 + + if failed > 0: + raise Exception(f'Failed to download file {file_url}') + + def get_dbi(self, url: str): + """ Returns a DB connection for a provided URL. + + Parameters + ---------- + url: str + The database URL to connect to + + Returns + ------- + An sqlalchemy engine connection. + """ + connect_url = make_url(url) + engine = create_engine(connect_url, isolation_level="AUTOCOMMIT") + + return engine.connect() + + def get_db_engine(self, url: str): + """ Returns a DB engine for a provided URL. + + Parameters + ---------- + url: str + The database URL to create an engine for + + Returns + ------- + An sqlalchemy engine.
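+ + For example (illustrative URL): engine = self.get_db_engine('mysql://user:pass@host:3306/xref_source_db')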
+ """ + connect_url = make_url(url) + engine = create_engine(connect_url, isolation_level="AUTOCOMMIT") + + return engine + + def load_checksum(self, path: str, url: str): + """ Loads the xref checksum files into a provided database. + This first combines the checksum data from different xref sources into 1 file called checksum.txt before loading into the DB. + + Parameters + ---------- + path: str + The path where the checksum files can be found + url: str + The database URL to load the checksum data into + """ + checksum_dir = os.path.join(path, 'Checksum') + if not os.path.exists(checksum_dir): os.makedirs(checksum_dir, exist_ok = True) + + # Connect to db + url = url + "?local_infile=1" + db_engine = self.get_db_engine(url) + with db_engine.connect() as dbi: + counter = 1 + source_id = 1 + + # Open the checksum output file + files = os.listdir(checksum_dir) + checksum_file = os.path.join(checksum_dir, 'checksum.txt') + with open(checksum_file, 'w') as output_fh: + # Go through all available checksum files + for file in files: + if re.search("checksum", file): continue + + input_file = os.path.join(checksum_dir, file) + match = re.search(r"\/([A-Za-z]*)-.*$", input_file) + source_name = match.group(1) + source_id = self.get_source_id_from_name(dbi, source_name) + + input_fh = self.get_filehandle(input_file) + for line in input_fh: + line = line.rstrip() + (id, checksum) = re.split(r"\s+", line) + + counter += 1 + output = [str(counter), str(source_id), id, checksum] + output_str = "\t".join(output) + output_fh.write(f'{output_str}\n') + + input_fh.close() + + query = f'load data local infile \'{checksum_file}\' into table checksum_xref' + dbi.execute(text(query)) + + def get_filehandle(self, filename: str): + """ Opens an appropriate read filehandle for a file based on its type. + + Parameters + ---------- + filename: str + The name and path of the file to read + + Returns + ------- + A read filehandle. + + Raises + ------ + FileNotFoundError + If no file name was provided. + If provided file could not be found. + """ + if not filename or filename == '': + raise FileNotFoundError('No file name') + + alt_filename = filename + alt_filename = re.sub(r"\.(gz|Z)$", "", alt_filename) + if alt_filename == filename: + alt_filename = alt_filename + ".gz" + + if not os.path.exists(filename): + if not os.path.exists(alt_filename): + raise FileNotFoundError(f'Could not find either {filename} or {alt_filename}') + filename = alt_filename + + if re.search(r"\.(gz|Z)$", filename): + fh = gzip.open(filename, 'rt') + else: + fh = open(filename, 'r') + + return fh + + def get_source_id_from_name(self, dbi, source_name: str): + """ Retrieves a source ID from its name from a database. + + Parameters + ---------- + dbi: db connection + The database connection to query in + source_name: str + The name of the source + + Returns + ------- + The source ID. + """ + query = select(SourceSORM.source_id).where(SourceSORM.name==source_name) + source_id = dbi.execute(query).scalar() + + return source_id + + def get_file_sections(self, file: str, delimiter: str): + """ Reads a provided file by sections, separated by a provided delimiter. + This function uses 'yield' to provide the file sections one by one. + + Parameters + ---------- + file: str + The name and path of the file to read + delimiter: str + The character or string separating the file sections + + Returns + ------- + A yield of file sections. 
+ """ + if re.search(r"\.(gz|Z)$", file): + with gzip.open(file, 'rt') as fh: + groups = groupby(fh, key=lambda x: x.lstrip().startswith(delimiter)) + for key,group in groups: + yield list(group) + else: + with open(file, 'r') as fh: + groups = groupby(fh, key=lambda x: x.lstrip().startswith(delimiter)) + for key,group in groups: + yield list(group) + + def create_xref_db(self, url: str, config_file: str, preparse:bool): + """ Creates the xref database from model. + This function always drops the database defined by the provided URL (if it exists) before creating a new one. + + Parameters + ---------- + url: str + The database URL with format: [driver]://[user]:[password]@[host]:[port]/[dbname] + config_file: str + The name and path of the .ini file that has information about xref sources and species + preparse: bool + Specifies whether source preparsing will be done or not + """ + engine = create_engine(url, isolation_level="AUTOCOMMIT") + + # Drop database and create again + if database_exists(engine.url): + drop_database(engine.url) + create_database(engine.url) + XrefUpdateDB.metadata.create_all(engine) + + xref_dbi = engine.connect() + self.populate_xref_db(xref_dbi, config_file, preparse) + + def populate_xref_db(self, dbi, config_file:str, preparse:bool): + """ Populates the xref database with configuration data. + + Parameters + ---------- + dbi: db connection + The xref database connection + config_file: str + The name and path of the .ini file that has information about xref sources and species to populate the database with + preparse: bool + Specifies whether source preparsing will be done or not (needed to decide if to use old parsers) + + Raises + ------ + KeyError + If a source exists in a species section in the configuration file, but has no source section of its own. 
+ """ + source_ids = {} + source_parsers = {} + species_sources = {} + + config = ConfigParser() + config.read(config_file) + + species_sections, sources_sections = {}, {} + + for section_name in config.sections(): + section = config[section_name] + (keyword, name) = re.split(r"\s+", section_name) + + if keyword == 'source': + sources_sections[name] = section + elif keyword == 'species': + species_sections[name] = section + + # Parse species sections + for species_name, section in species_sections.items(): + taxonomy_ids = section.get('taxonomy_id').split(",") + sources = section.get('sources') + aliases = section.get('aliases', species_name) + + species_id = taxonomy_ids[0] + + for tax_id in taxonomy_ids: + # Add new species + query = insert(SpeciesORM).values(species_id=species_id, taxonomy_id=tax_id, name=species_name, aliases=aliases) + dbi.execute(query) + + species_sources[species_id] = sources + + source_id = 0 + # Parse source sections + for source_name, section in sorted(sources_sections.items()): + source_id += 1 + source_name = section.get('name') + order = section.get('order') + priority = section.get('priority') + priority_description = section.get('prio_descr', '') + status = section.get('status', 'NOIDEA') + + old_parser = section.get('old_parser') + if old_parser and not preparse: + parser = old_parser + else: + parser = section.get('parser') + + # Add new source + query = insert(SourceUORM).values(name=source_name, source_release='1', ordered=order, priority=priority, priority_description=priority_description, status=status) + dbi.execute(query) + + source_ids[source_name] = source_id + source_parsers[source_id] = parser + + # Add source url rows + for species_id, sources in species_sources.items(): + source_names = sources.split(",") + + for source_name in source_names: + if not source_ids.get(source_name): + raise KeyError(f'No source section found for {source_name} in config file') + + source_id = source_ids[source_name] + parser = source_parsers[source_id] + query = insert(SourceURLORM).values(source_id=source_id, species_id=species_id, parser=parser) + dbi.execute(query) + + def get_source_id(self, dbi, parser: str, species_id: int, name: str, division_id: int): + """ Retrieves a source ID from its parser, species ID, name or division ID. + + Parameters + ---------- + dbi: db connection + The database connection to query in + parser: str + The source parser + species_id: int + The ID of the species related to the source + name: str + The source name + division_id: int + The ID of the division related to the source + + Returns + ------- + The source ID. 
+ """ + name = "%"+name+"%" + source_id = None + + query = select(SourceURLORM.source_id).where(SourceUORM.source_id==SourceURLORM.source_id, SourceURLORM.parser==parser, SourceURLORM.species_id==species_id) + result = dbi.execute(query) + if result.rowcount == 1: + source_id = result.scalar() + + query = select(SourceURLORM.source_id).where(SourceUORM.source_id==SourceURLORM.source_id, SourceURLORM.parser==parser, SourceURLORM.species_id==species_id).filter(SourceUORM.name.like(name)) + result = dbi.execute(query) + if result.rowcount == 1: + source_id = result.scalar() + + if not source_id: + query = select(SourceURLORM.source_id).where(SourceUORM.source_id==SourceURLORM.source_id, SourceURLORM.parser==parser, SourceURLORM.species_id==division_id).filter(SourceUORM.name.like(name)) + result = dbi.execute(query).first() + if result: + source_id = result[0] + + return source_id + + def get_taxon_id(self, dbi): + """ Retrieves the species.taxonomy_id value of the meta table in a database. + + Parameters + ---------- + dbi: db connection + The database connection to query in + + Returns + ------- + The taxonomy ID in the database or 1 if not found. + """ + query = select(MetaCORM.meta_value).where(MetaCORM.meta_key=='species.taxonomy_id') + result = dbi.execute(query) + if result.rowcount > 0: + return result.scalar() + + return 1 + + def get_division_id(self, dbi): + """ Retrives the division ID from a database based on the species.division value of the meta table. + + Parameters + ---------- + dbi: db connection + The database connection to query in + + Returns + ------- + The division ID in the database or 1 if not found + """ + query = select(MetaCORM.meta_value).where(MetaCORM.meta_key=='species.division') + result = dbi.execute(query) + + if result.rowcount > 0: + division = result.scalar() + + division_taxon = { + 'Ensembl' : 7742, + 'EnsemblVertebrates' : 7742, + 'Vertebrates' : 7742, + 'EnsemblMetazoa' : 33208, + 'Metazoa' : 33208, + 'Plants' : 33090, + 'EnsemblPlants' : 33090, + } + + division_id = division_taxon.get(division) + if division_id: + return division_id + + return 1 + + def get_path(self, base_path: str, species: str, release: int, category: str, file_name: str=None): + """ Creates directories based on provided data. + + Parameters + ---------- + base_path: str + The base file path + species: str + The species name + release: int + The ensEMBL release number + category: str + The file category + file_name: str, optional + The file name + + Returns + ------- + A file path. + """ + full_path = os.path.join(base_path, species, release, category) + if not os.path.exists(full_path): + os.makedirs(full_path, exist_ok = True) + + if file_name: + return os.path.join(full_path, file_name) + else: + return full_path + + def get_db_from_registry(self, species: str, group: str, release: int, registry: str): + """ Looks up a db in the registry and returns an sqlaclehmy angine for it. + + Parameters + ---------- + species: str + The species name + group: str + The db group (core, ccds, otherfeatures, etc...) + release: int + The ensEMBL release number + registry: str + The registry url + + Returns + ------- + A db engine or 0 if no db is found. 
+ """ + # Fix registry url, if needed + match = re.search(r"^(.*)://(.*)", registry) + if match: registry = match.group(2) + match = re.search(r"(.*)/(.*)", registry) + if match: registry = match.group(1) + + metasearch_url = self.param_required('metasearch_url') + metasearch_body = { + "name_pattern":f'{species}_{group}%', + "filters":[ + { + "meta_key":"schema_version", + "meta_value":release + }, + ], + "servers":[registry] + } + + dbs = requests.post(metasearch_url, json=metasearch_body).json() + dbs = dbs[registry] + + if len(dbs) > 0: + db_url = 'mysql://' + dbs[0] + return db_url + else: + return 0 + diff --git a/src/python/ensembl/production/xrefs/Checksum.py b/src/python/ensembl/production/xrefs/Checksum.py new file mode 100644 index 000000000..7ccb401a7 --- /dev/null +++ b/src/python/ensembl/production/xrefs/Checksum.py @@ -0,0 +1,46 @@ +# See the NOTICE file distributed with this work for additional information +# regarding copyright ownership. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Checksum module for the Xref Download pipeline.""" + +from ensembl.production.xrefs.Base import * + +class Checksum(Base): + def run(self): + base_path = self.param_required('base_path') + source_db_url = self.param_required('source_db_url') + skip_download = self.param_required('skip_download', {'type': 'bool'}) + + logging.info('Checksum starting with parameters:') + logging.info(f'Param: base_path = {base_path}') + logging.info(f'Param: source_db_url = {source_db_url}') + logging.info(f'Param: skip_download = {skip_download}') + + # Connect to source db + db_engine = self.get_db_engine(source_db_url) + + # Check if checksums already exist + table_nonempty = 0 + if skip_download: + with db_engine.connect() as dbi: + query = select(func.count(ChecksumXrefSORM.checksum_xref_id)) + table_nonempty = dbi.execute(query).scalar() + + # Load checksums from files into db + if not table_nonempty: + self.load_checksum(base_path, source_db_url) + logging.info('Checksum data loaded') + else: + logging.info('Checksum data already exists, skipping loading') + diff --git a/src/python/ensembl/production/xrefs/DownloadSource.py b/src/python/ensembl/production/xrefs/DownloadSource.py new file mode 100644 index 000000000..060fcb116 --- /dev/null +++ b/src/python/ensembl/production/xrefs/DownloadSource.py @@ -0,0 +1,63 @@ +# See the NOTICE file distributed with this work for additional information +# regarding copyright ownership. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +"""Download module to download xref and version files.""" + +from ensembl.production.xrefs.Base import * + +class DownloadSource(Base): + def run(self): + base_path = self.param_required('base_path') + parser = self.param_required('parser') + name = self.param_required('name') + priority = self.param_required('priority') + source_db_url = self.param_required('source_db_url') + file = self.param_required('file') + skip_download = self.param_required('skip_download', {'type': 'bool'}) + db = self.param('db') + version_file = self.param('version_file') + preparse = self.param('preparse', None, {'type': 'bool'}) + rel_number = self.param('rel_number') + catalog = self.param('catalog') + + logging.info(f'DownloadSource starting for source {name}') + + # Download the main xref file + extra_args = {} + extra_args['skip_download_if_file_present'] = skip_download + extra_args['db'] = db + if rel_number and catalog: + extra_args['rel_number'] = rel_number + extra_args['catalog'] = catalog + file_name = self.download_file(file, base_path, name, extra_args) + + # Download the version file + version = "" + if version_file: + extra_args['release'] = 'version' + version = self.download_file(version_file, base_path, name, extra_args) + + # Update source db + db_engine = self.get_db_engine(source_db_url) + with db_engine.connect() as dbi: + query = insert(SourceSORM).values(name=name, parser=parser).prefix_with('IGNORE') + dbi.execute(query) + + query = select(SourceSORM.source_id).where(SourceSORM.name==name) + source_id = dbi.execute(query).scalar() + + if preparse is None: preparse = False + query = insert(VersionORM).values(source_id=source_id, uri=file_name, index_uri=db, count_seen=priority, revision=version, preparse=preparse).prefix_with('IGNORE') + dbi.execute(query) + diff --git a/src/python/ensembl/production/xrefs/EmailNotification.py b/src/python/ensembl/production/xrefs/EmailNotification.py new file mode 100644 index 000000000..22738d990 --- /dev/null +++ b/src/python/ensembl/production/xrefs/EmailNotification.py @@ -0,0 +1,136 @@ +# See the NOTICE file distributed with this work for additional information +# regarding copyright ownership. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Email module to send user emails notifying of xref pipelines end, with important information and statistics.""" + +from ensembl.production.xrefs.Base import * + +from smtplib import SMTP +from email.message import EmailMessage + +class EmailNotification(Base): + def run(self): + pipeline_name = self.param_required('pipeline_name') + base_path = self.param_required('base_path') + email_address = self.param_required('email') + email_server = self.param_required('email_server') + log_timestamp = self.param('log_timestamp') + + email_message = f'The {pipeline_name} has completed its run.
<br>' + + if log_timestamp: + # Get the path of the log files + log_path = os.path.join(base_path, 'logs', log_timestamp) + + # Read the log file + if os.path.exists(log_path): + log_files = os.listdir(log_path) + + parameters, sources, added_species, skipped_species = {}, {}, {}, {} + + main_log_file = os.path.join(base_path, 'logs', log_timestamp, 'logfile_'+log_timestamp) + + # Copy different log files into a main one + with open(main_log_file, 'a') as out_fh: + for log_file in log_files: + if not re.search(r"^tmp_", log_file): continue + log_file = os.path.join(log_path, log_file) + with open(log_file) as in_fh: + log_data = in_fh.read() + out_fh.write(log_data) + os.remove(log_file) + + # Read the full logs + with open(main_log_file) as fh: + data = fh.read() + + # Extract parameter data + parameters_list = re.findall(r"^\d{2}-\w{3}-\d{4} \| INFO \| Param: (\w+) = (.*)", data, re.MULTILINE) + parameters = {param[0]: param[1] for param in parameters_list} + + email_message += '<br>The pipeline was run with the following parameters:<br>'
+ for param_name,param_value in parameters.items(): + email_message += f'{param_name} = {param_value}<br>'
+ + if re.search('Download', pipeline_name): + # Extract data from logs + sources_list = re.findall(r"^\d{2}-\w{3}-\d{4} \| INFO \| Source to download: ([\w\/]+)", data, re.MULTILINE) + sources = {source : {'to_download' : 1} for source in sources_list} + + sources_list = re.findall(r"^\d{2}-\w{3}-\d{4} \| INFO \| Source to cleanup: ([\w\/]+)", data, re.MULTILINE) + for source in sources_list: sources[source].update({'to_cleanup' : 1}) + + sources_list = re.findall(r"^\d{2}-\w{3}-\d{4} \| INFO \| Source to preparse: ([\w\/]+)", data, re.MULTILINE) + for source in sources_list: sources[source].update({'to_preparse' : 1}) + + sources_list = re.findall(r"^\d{2}-\w{3}-\d{4} \| INFO \| Source ([\w\/]+) cleaned up", data, re.MULTILINE) + for source in sources_list: sources[source].update({'cleaned_up' : 1}) + + sources_list = re.findall(r"^\d{2}-\w{3}-\d{4} \| INFO \| Source ([\w\/]+) preparsed", data, re.MULTILINE) + for source in sources_list: sources[source].update({'preparsed' : 1}) + + sources_list = re.findall(r"^\d{2}-\w{3}-\d{4} \| INFO \| ([\w\/]+) file already exists, skipping download \((.*)\)", data, re.MULTILINE) + for source in sources_list: sources[source[0]].update({'skipped' : os.path.dirname(source[1])}) + + sources_list = re.findall(r"^\d{2}-\w{3}-\d{4} \| INFO \| ([\w\/]+) file downloaded via (HTTP|FTP): (.*)", data, re.MULTILINE) + for source in sources_list: sources[source[0]].update({'downloaded' : source[1]+"|"+os.path.dirname(source[2])}) + + sources_list = re.findall(r"^\d{2}-\w{3}-\d{4} \| INFO \| ([\w\/]+) file copied from local FTP: (.*)", data, re.MULTILINE) + for source in sources_list: sources[source[0]].update({'copied' : os.path.dirname(source[1])}) + + skipped_species_list = re.findall(r"^\d{2}-\w{3}-\d{4} \| INFO \| (\w+) skipped species = (\d+)", data, re.MULTILINE) + skipped_species = {source[0]: source[1] for source in skipped_species_list} + + added_species_list = re.findall(r"^\d{2}-\w{3}-\d{4} \| INFO \| (\w+) species files created = (\d+)", data, re.MULTILINE) + added_species = {source[0]: source[1] for source in added_species_list} + + # Include source statistics + email_message += '<br>--Source Statistics--<br>'
+ for source_name,source_values in sources.items(): + email_message += f'{source_name}:<br>'
+ if source_values.get('to_download'): email_message += '   Scheduled for download ✔<br>'
+ + if source_values.get('downloaded'): + (download_type, file_path) = source_values['downloaded'].split("|") + email_message += f'   File downloaded via {download_type} into {file_path}<br>'
+ elif source_values.get('copied'): email_message += '   File(s) copied from local FTP into %s<br>' % (source_values['copied'])
+ elif source_values.get('skipped'): email_message += '   File(s) download skipped, already exists in %s<br>' % (source_values['skipped'])
+ + if source_values.get('to_cleanup'): email_message += '   Scheduled for cleanup ✔<br>'
+ if source_values.get('cleaned_up'): email_message += '   Cleaned up ✔<br>'
+ + if source_values.get('to_preparse'): email_message += '   Scheduled for pre-parse ✔<br>'
+ if source_values.get('preparsed'): email_message += '   Pre-parsed ✔<br>'
+ + # Include species statistics + email_message += '<br>--Species Statistics--<br>'
+ email_message += 'Skipped Species (files already exist):<br>'
+ for source_name, count in skipped_species.items(): + email_message += f'   {source_name}: {count}<br>'
+ email_message += 'Added Species (files created):<br>'
+ for source_name, count in added_species.items(): + email_message += f'   {source_name}: {count}<br>'
+ + email_message += '<br>
To run the Xref Process Pipeline based on the data from this pipeline, use the same --base_path, --source_db_url, and --central_db_url (if preparse was run) values provided to this pipeline.' + + # Send email + message = EmailMessage() + message['Subject'] = f'{pipeline_name} Finished' + message['From'] = email_address + message['To'] = email_address + message.set_content(email_message, 'html') + + smtp = SMTP(email_server) + smtp.send_message(message) + diff --git a/src/python/ensembl/production/xrefs/ScheduleCleanup.py b/src/python/ensembl/production/xrefs/ScheduleCleanup.py new file mode 100644 index 000000000..58396b33a --- /dev/null +++ b/src/python/ensembl/production/xrefs/ScheduleCleanup.py @@ -0,0 +1,57 @@ +# See the NOTICE file distributed with this work for additional information +# regarding copyright ownership. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Scheduling module to create cleanup jobs for specific xref sources.""" + +from ensembl.production.xrefs.Base import * + +class ScheduleCleanup(Base): + def run(self): + base_path = self.param_required('base_path') + source_db_url = self.param_required('source_db_url') + clean_files = self.param('clean_files') + clean_dir = self.param('clean_dir') + split_files_by_species = self.param('split_files_by_species') + + logging.info('ScheduleCleanup starting with parameters:') + logging.info(f'Param: base_path = {base_path}') + logging.info(f'Param: source_db_url = {source_db_url}') + logging.info(f'Param: clean_files = {clean_files}') + logging.info(f'Param: clean_dir = {clean_dir}') + logging.info(f'Param: split_files_by_species = {split_files_by_species}') + + # Connect to source db + db_engine = self.get_db_engine(source_db_url) + with db_engine.connect() as dbi: + # Get name and version file for each source + query = select(SourceSORM.name, VersionORM.revision).where(SourceSORM.source_id==VersionORM.source_id).distinct() + sources = dbi.execute(query).mappings().all() + + for source in sources: + # Only cleaning RefSeq and UniProt for now + if not (re.search(r"^RefSeq_(dna|peptide)", source.name) or re.search(r"^Uniprot", source.name)): continue + + # Remove / char from source name to access directory + clean_name = source.name + clean_name = re.sub(r"\/", "", clean_name) + + # Send parameters into cleanup jobs for each source + if os.path.exists(os.path.join(base_path, clean_name)): + logging.info(f'Source to cleanup: {source.name}') + + self.write_output('cleanup_sources', { + 'name' : source.name, + 'version_file' : source.revision + }) + diff --git a/src/python/ensembl/production/xrefs/ScheduleDownload.py b/src/python/ensembl/production/xrefs/ScheduleDownload.py new file mode 100644 index 000000000..8001bccc8 --- /dev/null +++ b/src/python/ensembl/production/xrefs/ScheduleDownload.py @@ -0,0 +1,73 @@ +# See the NOTICE file distributed with this work for additional information +# regarding copyright ownership. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Scheduling module to create download jobs for all xref sources in config file.""" + +from ensembl.production.xrefs.Base import * + +class ScheduleDownload(Base): + def run(self): + config_file = self.param_required('config_file') + source_db_url = self.param_required('source_db_url') + reuse_db = self.param_required('reuse_db', {'type': 'bool'}) + skip_preparse = self.param('skip_preparse', None, {'type': 'bool', 'default' : False}) + + logging.info('ScheduleDownload starting with parameters:') + logging.info(f'Param: config_file = {config_file}') + logging.info(f'Param: source_db_url = {source_db_url}') + logging.info(f'Param: reuse_db = {reuse_db}') + logging.info(f'Param: skip_preparse = {skip_preparse}') + + # Create the source db from url + self.create_source_db(source_db_url, reuse_db) + + # Extract sources to download from config file + sources = [] + with open(config_file) as conf_file: + sources = json.load(conf_file) + + if len(sources) < 1: + raise IOError(f'No sources found in config file {config_file}. Need sources to run pipeline') + + for source_data in sources: + name = source_data['name'] + parser = source_data['parser'] + priority = source_data['priority'] + file = source_data['file'] + db = source_data.get('db') + version_file = source_data.get('release') + preparse = source_data.get('preparse') + rel_number = source_data.get('release_number') + catalog = source_data.get('catalog') + + logging.info(f'Source to download: {name}') + + # Revert to the old parser if not pre-parsing + if preparse and skip_preparse: + parser = source_data['old_parser'] + preparse = 0 + + # Pass the source parameters into download jobs + self.write_output('sources', { + 'parser' : parser, + 'name' : name, + 'priority' : priority, + 'db' : db, + 'version_file' : version_file, + 'preparse' : preparse, + 'file' : file, + 'rel_number' : rel_number, + 'catalog' : catalog + }) + diff --git a/src/python/ensembl/production/xrefs/config/xref_all_sources.json b/src/python/ensembl/production/xrefs/config/xref_all_sources.json new file mode 100644 index 000000000..a89f40d4a --- /dev/null +++ b/src/python/ensembl/production/xrefs/config/xref_all_sources.json @@ -0,0 +1,248 @@ +[ + { + "name" : "ArrayExpress", + "parser" : "ArrayExpressParser", + "file" : "Database", + "db" : "core", + "priority" : 1 + }, + { + "name" : "CCDS", + "parser" : "CCDSParser", + "file" : "Database", + "db" : "ccds", + "priority" : 1 + }, + { + "name" : "UniParc", + "parser" : "ChecksumParser", + "file" : "https://ftp.ebi.ac.uk/pub/contrib/uniparc/upidump.lis.gz", + "db" : "checksum", + "priority" : 1 + }, + { + "name" : "RNACentral", + "parser" : "ChecksumParser", + "file" : "https://ftp.ebi.ac.uk/pub/databases/RNAcentral/current_release/md5/md5.tsv.gz", + "db" : "checksum", + "priority" : 1 + }, + { + "name" : "DBASS3", + "parser" : "DBASSParser", + "file" : "https://www.dbass.soton.ac.uk/Dbass3/DownloadCsv", + "priority" : 1 + }, + { + "name" : "DBASS5", + "parser" : "DBASSParser", 
+ "file" : "https://www.dbass.soton.ac.uk/Dbass5/DownloadCsv", + "priority" : 1 + }, + { + "name" : "EntrezGene", + "parser" : "EntrezGeneParser", + "file" : "https://ftp.ncbi.nlm.nih.gov/gene/DATA/gene_info.gz", + "priority" : 1 + }, + { + "name" : "HPA", + "parser" : "HPAParser", + "file" : "https://www.proteinatlas.org/download/xref.php", + "priority" : 1 + }, + { + "name" : "MGI", + "parser" : "MGIParser", + "file" : "https://www.informatics.jax.org/downloads/reports/MRK_ENSEMBL.rpt", + "priority" : 2 + }, + { + "name" : "MGI_desc", + "parser" : "MGI_Desc_Parser", + "file" : "https://www.informatics.jax.org/downloads/reports/MRK_List2.rpt", + "priority" : 1 + }, + { + "name" : "MGI_ccds", + "parser" : "MGI_CCDS_Parser", + "file" : "https://ftp.ncbi.nlm.nih.gov/pub/CCDS/current_mouse/CCDS.current.txt", + "priority" : 2 + }, + { + "name" : "MIM2GENE", + "parser" : "Mim2GeneParser", + "file" : "https://ftp.ncbi.nlm.nih.gov/gene/DATA/mim2gene_medgen", + "priority" : 3 + }, + { + "name" : "MIM", + "parser" : "MIMParser", + "file" : "https://data.omim.org/downloads/ZpPlmgwjuTBK9T5vf2sFjA/omim.txt.gz", + "priority" : 2 + }, + { + "name" : "RFAM", + "parser" : "RFAMParser", + "file" : "https://ftp.ebi.ac.uk/pub/databases/Rfam/CURRENT/Rfam.seed.gz", + "db" : "core", + "priority" : 1 + }, + { + "name" : "RGD", + "parser" : "RGDParser", + "file" : "https://download.rgd.mcw.edu/pub/data_release/GENES_RAT.txt", + "priority" : 2 + }, + { + "name" : "Reactome", + "parser" : "ReactomeParser", + "file" : "https://www.reactome.org/download/current/Ensembl2Reactome_All_Levels.txt", + "release" : "https://www.reactome.org/ReactomeRESTfulAPI/RESTfulWS/version", + "priority" : 1 + }, + { + "name" : "Reactome", + "parser" : "ReactomeParser", + "file" : "https://www.reactome.org/download/current/UniProt2Reactome_All_Levels.txt", + "release" : "https://www.reactome.org/ReactomeRESTfulAPI/RESTfulWS/version", + "priority" : 2 + }, + { + "name" : "RefSeq_dna", + "parser" : "RefSeqDatabaseParser", + "old_parser" : "RefSeqGPFFParser", + "file" : "https://ftp.ncbi.nlm.nih.gov/refseq/release/complete/complete.*rna.gbff.gz", + "method" : "--bestn 5", + "query_cutoff" : 90, + "target_cutoff" : 90, + "release" : "https://ftp.ncbi.nlm.nih.gov/refseq/release/release-notes/RefSeq-release*.txt", + "preparse" : 1, + "priority" : 2, + "release_number" : "https://ftp.ncbi.nlm.nih.gov/refseq/release/RELEASE_NUMBER", + "catalog" : "https://ftp.ncbi.nlm.nih.gov/refseq/release/release-catalog/release*.files.installed" + }, + { + "name" : "RefSeq_peptide", + "parser" : "RefSeqDatabaseParser", + "old_parser" : "RefSeqGPFFParser", + "file" : "https://ftp.ncbi.nlm.nih.gov/refseq/release/complete/complete.*.protein.gpff.gz", + "method" : "--bestn 1", + "query_cutoff" : 100, + "target_cutoff" : 100, + "release" : "https://ftp.ncbi.nlm.nih.gov/refseq/release/release-notes/RefSeq-release*.txt", + "preparse" : 1, + "priority" : 3, + "release_number" : "https://ftp.ncbi.nlm.nih.gov/refseq/release/RELEASE_NUMBER", + "catalog" : "https://ftp.ncbi.nlm.nih.gov/refseq/release/release-catalog/release*.files.installed" + }, + { + "name" : "Refseq_import", + "parser" : "RefSeqCoordinateParser", + "file" : "Database", + "db" : "otherfeatures", + "priority" : 2 + }, + { + "name" : "UCSC_hg38", + "parser" : "UCSCParser", + "file" : "https://hgdownload.cse.ucsc.edu/goldenPath/hg38/database/knownGene.txt.gz", + "release" : "https://hgdownload.cse.ucsc.edu/goldenPath/hg38/database/README.txt", + "priority" : 1 + }, + { + "name" : "UCSC_mm10", + "parser" 
: "UCSCParser", + "file" : "https://hgdownload.cse.ucsc.edu/goldenPath/mm10/database/knownGene.txt.gz", + "release" : "https://hgdownload.cse.ucsc.edu/goldenPath/mm10/database/README.txt", + "priority" : 1 + }, + { + "name" : "Uniprot/SWISSPROT", + "parser" : "UniProtDatabaseParser", + "old_parser" : "UniProtParser", + "file" : "https://ftp.ebi.ac.uk/pub/databases/uniprot/knowledgebase/uniprot_sprot.dat.gz", + "method" : "--bestn 1", + "query_cutoff" : 100, + "target_cutoff" : 100, + "preparse" : 1, + "release" : "https://ftp.ebi.ac.uk/pub/databases/uniprot/knowledgebase/reldate.txt", + "priority" : 1 + }, + { + "name" : "Uniprot/SPTREMBL", + "parser" : "UniProtDatabaseParser", + "old_parser" : "UniProtParser", + "file" : "https://ftp.ebi.ac.uk/pub/databases/uniprot/knowledgebase/uniprot_trembl.dat.gz", + "method" : "--bestn 1", + "query_cutoff" : 100, + "target_cutoff" : 100, + "preparse" : 1, + "release" : "https://ftp.ebi.ac.uk/pub/databases/uniprot/knowledgebase/reldate.txt", + "priority" : 1 + }, + { + "name" : "VGNC", + "parser" : "VGNCParser", + "file" : "https://ftp.ebi.ac.uk/pub/databases/genenames/vgnc/tsv/vgnc_gene_set_All.txt.gz", + "priority" : 1 + }, + { + "name" : "ZFIN_ID", + "parser" : "ZFINParser", + "file" : "https://zfin.org/data_transfer/Downloads/refseq.txt", + "priority" : 3 + }, + { + "name" : "ZFIN_ID", + "parser" : "ZFINParser", + "file" : "https://zfin.org/data_transfer/Downloads/uniprot.txt", + "priority" : 2 + }, + { + "name" : "ZFIN_ID", + "parser" : "ZFINParser", + "file" : "https://zfin.org/data_transfer/Downloads/aliases.txt", + "priority" : 2 + }, + { + "name" : "ZFIN_ID", + "parser" : "ZFINParser", + "file" : "https://zfin.org/data_transfer/Downloads/gene_seq.txt", + "priority" : 1 + }, + { + "name" : "ZFIN_desc", + "parser" : "ZFINDescParser", + "file" : "ftp://zfin.org/pub/transfer/MEOW/zfin_genes.txt", + "priority" : 1 + }, + { + "name" : "cint_jgi_v1", + "parser" : "JGI_ProteinParser", + "file" : "https://ftp.ensembl.org/pub/misc/cint_jgi/v1/ciona.prot.fasta.gz", + "priority" : 1 + }, + { + "name" : "Xenbase", + "parser" : "XenopusJamboreeParser", + "file" : "http://ftp.xenbase.org/pub/GenePageReports/GenePageEnsemblModelMapping.txt", + "priority" : 1 + }, + { + "name" : "miRBase", + "parser" : "miRBaseParser", + "file" : "https://mirbase.org/download/miRNA.dat", + "method" : "--bestn 1", + "query_cutoff" : 90, + "target_cutoff" : 90, + "priority" : 1 + }, + { + "name" : "HGNC", + "parser" : "HGNCParser", + "file" : "https://www.genenames.org/cgi-bin/download/custom?col=gd_hgnc_id&col=gd_app_sym&col=gd_app_name&col=gd_prev_sym&col=gd_aliases&col=gd_pub_eg_id&col=gd_pub_ensembl_id&col=gd_pub_refseq_ids&col=gd_ccds_ids&col=gd_lsdb_links&status=Approved&status_opt=2&where=&order_by=gd_app_sym_sort&format=text&limit=&hgnc_dbtag=on&submit=submit", + "db" : "ccds", + "priority" : 3 + } +] diff --git a/src/python/ensembl/production/xrefs/config/xref_config.ini b/src/python/ensembl/production/xrefs/config/xref_config.ini new file mode 100644 index 000000000..5a4830d52 --- /dev/null +++ b/src/python/ensembl/production/xrefs/config/xref_config.ini @@ -0,0 +1,1680 @@ +########################################################################## +# SOURCES # +# # +# Keys: # +# name - name of this source (required) # +# order - parsing order for this source (required) # +# priority - priority of these data files when more files belong # +# to the same source 'name' (required) # +# prio_descr - label for the 'priority' (optional) # +# parser - the parser to be used 
(required) # +# dependent_on - Comma separated list of sources which must be loaded # +# first (optional) # +# Note that if species does not have xrefs from a # +# master source specified in this list then the # +# dependency is ignored # +# # +########################################################################## + +[source EC_NUMBER::saccharomyces_cerevisiae] +# Used by S.cerevisiae +name = EC_NUMBER +order = 50 +priority = 70 +parser = UniProtParser + +[source BioGRID::drosophila_melanogaster] +# Used by drosophila_melanogaster +name = BioGRID +order = 50 +priority = 1 +parser = FlybaseParser + +[source EPD::drosophila_melanogaster] +# Used by the 12 drosophila genomes +name = EPD +order = 50 +priority = 1 +parser = FlybaseParser + +[source FlyExpress::drosophila_melanogaster] +# Used by drosophila_melanogaster +name = FlyExpress +order = 50 +priority = 1 +parser = FlybaseParser + +[source FlyReactome::drosophila_melanogaster] +# Used by drosophila_melanogaster +name = FlyReactome +order = 50 +priority = 1 +parser = FlybaseParser + +[source GenomeRNAi::drosophila_melanogaster] +# Used by drosophila_melanogaster +name = GenomeRNAi +order = 50 +priority = 1 +parser = FlybaseParser + +[source InteractiveFly::drosophila_melanogaster] +# Used by drosophila_melanogaster +name = InteractiveFly +order = 50 +priority = 1 +parser = FlybaseParser + +[source miRBase::drosophila_melanogaster] +# Used by the 12 drosophila genomes +name = miRBase +order = 50 +priority = 1 +parser = FlybaseParser + +[source MitoDrome::drosophila_melanogaster] +# Used by drosophila_melanogaster +name = MitoDrome +order = 50 +priority = 1 +parser = FlybaseParser + +[source TransFac::drosophila_melanogaster] +# Used by the 12 drosophila genomes +name = TransFac +order = 50 +priority = 1 +parser = FlybaseParser + +[source TransFac::drosophila_pseudoobscura] +# Used by the 12 drosophila genomes +name = TransFac +order = 50 +priority = 1 +parser = FlybaseParser + +[source flybase_annotation_id::drosophila_melanogaster] +# Used by drosophila_melanogaster +name = flybase_annotation_id +order = 50 +priority = 1 +prio_descr = Annotation ID assigned by FlyBase +parser = FlybaseParser + +[source flybase_gene_id::drosophila_melanogaster] +# Used by drosophila_melanogaster +name = flybase_gene_id +order = 75 +priority = 1 +prio_descr = ID assigned by FlyBase +parser = FlybaseParser +dependent_on = Uniprot/SPTREMBL,Uniprot/SWISSPROT + +[source flybase_gene_id::drosophila_pseudoobscura] +# Used by drosophila_pseudoobscura +name = flybase_gene_id +order = 100 +priority = 1 +prio_descr = ID assigned by FlyBase +parser = FlybaseParser +dependent_on = Uniprot/SPTREMBL,Uniprot/SWISSPROT + +[source flybase_gene_id::drosophila_grimshawi] +# Used by drosophila_grimshawi +name = flybase_gene_id +order = 100 +priority = 1 +prio_descr = ID assigned by FlyBase +parser = FlybaseParser +dependent_on = Uniprot/SPTREMBL,Uniprot/SWISSPROT + +[source flybase_gene_id::drosophila_willistoni] +# Used by drosophila_willistoni +name = flybase_gene_id +order = 100 +priority = 1 +prio_descr = ID assigned by FlyBase +parser = FlybaseParser +dependent_on = Uniprot/SPTREMBL,Uniprot/SWISSPROT + +[source flybase_gene_id::drosophila_ananassae] +# Used by drosophila_ananassae +name = flybase_gene_id +order = 100 +priority = 1 +prio_descr = ID assigned by FlyBase +parser = FlybaseParser +dependent_on = Uniprot/SPTREMBL,Uniprot/SWISSPROT + +[source flybase_gene_id::drosophila_yakuba] +# Used by drosophila_yakuba +name = flybase_gene_id +order = 100 +priority = 1
+prio_descr = ID assigned by FlyBase +parser = FlybaseParser +dependent_on = Uniprot/SPTREMBL,Uniprot/SWISSPROT + +[source flybase_gene_id::drosophila_simulans] +# Used by drosophila_simulans +name = flybase_gene_id +order = 100 +priority = 1 +prio_descr = ID assigned by FlyBase +parser = FlybaseParser +dependent_on = Uniprot/SPTREMBL,Uniprot/SWISSPROT + +[source flybase_gene_id::drosophila_sechellia] +# Used by drosophila_sechellia +name = flybase_gene_id +order = 100 +priority = 1 +prio_descr = ID assigned by FlyBase +parser = FlybaseParser +dependent_on = Uniprot/SPTREMBL,Uniprot/SWISSPROT + +[source flybase_gene_id::drosophila_erecta] +# Used by drosophila_erecta +name = flybase_gene_id +order = 100 +priority = 1 +prio_descr = ID assigned by FlyBase +parser = FlybaseParser +dependent_on = Uniprot/SPTREMBL,Uniprot/SWISSPROT + +[source flybase_gene_id::drosophila_persimilis] +# Used by drosophila_persimilis +name = flybase_gene_id +order = 100 +priority = 1 +prio_descr = ID assigned by FlyBase +parser = FlybaseParser +dependent_on = Uniprot/SPTREMBL,Uniprot/SWISSPROT + +[source flybase_gene_id::drosophila_mojavensis] +# Used by drosophila_mojavensis +name = flybase_gene_id +order = 100 +priority = 1 +prio_descr = ID assigned by FlyBase +parser = FlybaseParser +dependent_on = Uniprot/SPTREMBL,Uniprot/SWISSPROT + +[source flybase_gene_id::drosophila_virilis] +# Used by drosophila_virilis +name = flybase_gene_id +order = 100 +priority = 1 +prio_descr = ID assigned by FlyBase +parser = FlybaseParser +dependent_on = Uniprot/SPTREMBL,Uniprot/SWISSPROT + +[source flybase_transcript_id::drosophila_melanogaster] +# Used by drosophila_melanogaster +name = flybase_transcript_id +order = 50 +priority = 1 +prio_descr = ID assigned by FlyBase +parser = FlybaseParser + +[source flybase_translation_id::drosophila_melanogaster] +# Used by drosophila_melanogaster +name = flybase_translation_id +order = 50 +priority = 1 +prio_descr = ID assigned by FlyBase +parser = FlybaseParser + +[source FlyBaseCGID_gene::drosophila_melanogaster] +# Used by drosophila_melanogaster +name = FlyBaseCGID_gene +order = 50 +priority = 1 +prio_descr = FlyBase_Annotation_IDs +parser = FlybaseParser + +[source FlyBaseCGID_transcript::drosophila_melanogaster] +# Used by drosophila_melanogaster +name = FlyBaseCGID_transcript +order = 50 +priority = 1 +prio_descr = FlyBase_Annotation_IDs +parser = FlybaseParser + +[source FlyBaseCGID_translation::drosophila_melanogaster] +# Used by drosophila_melanogaster +name = FlyBaseCGID_translation +order = 50 +priority = 1 +prio_descr = FlyBase_Annotation_IDs +parser = FlybaseParser + +[source FlyBaseName_gene::drosophila_melanogaster] +# Used by drosophila_melanogaster +name = FlyBaseName_gene +order = 50 +priority = 1 +prio_descr = Name assigned to gene in FlyBase gff +parser = FlybaseParser + +[source FlyBaseName_transcript::drosophila_melanogaster] +# Used by drosophila_melanogaster +name = FlyBaseName_transcript +order = 50 +priority = 1 +prio_descr = Transcript name in FlyBase gff +parser = FlybaseParser + +[source FlyBaseName_translations::drosophila_melanogaster] +# Used by drosophila_melanogaster +name = FlyBaseName_translation +order = 50 +priority = 1 +prio_descr = Translation name in FlyBase gff +parser = FlybaseParser + +[source PHIbase::MULTI] +name = PHIbase +order = 50 +priority = 1 +parser = PHIbaseParser +dependent_on = Uniprot/SWISSPROT,Uniprot/SPTREMBL + +[source ArrayExpress::MULTI] +# Used by all ensembl species +name = ArrayExpress +order = 50 +priority = 1 +parser = 
ArrayExpressParser + +[source ArrayExpress::EG] +name = ArrayExpress +order = 50 +priority = 1 +parser = ArrayExpressParser + +[source CCDS::homo_sapiens] +# Used by homo_sapiens +name = CCDS +order = 10 +priority = 1 +parser = CCDSParser + +[source CCDS::mus_musculus] +# Used by mus_musculus +name = CCDS +order = 10 +priority = 1 +parser = CCDSParser + +[source DBASS5::homo_sapiens] +# Used by homo_sapiens +name = DBASS5 +order = 50 +priority = 1 +prio_descr = Database of aberrant 5\' splice sites. +parser = DBASSParser + +[source DBASS3::homo_sapiens] +# Used by homo_sapiens +name = DBASS3 +order = 50 +priority = 1 +prio_descr = Database of aberrant 3\' splice sites. +parser = DBASSParser + +[source EntrezGene::MULTI] +# Used by aedes_aegypti, anolis_carolinensis, anopheles_gambiae, acyrthosiphon_pisum, apis_mellifera, bos_taurus, caenorhabditis_elegans, canis_familiaris, cavia_porcellus, ciona_intestinalis, ciona_savignyi, danio_rerio, dasypus_novemcinctus, drosophila_melanogaster, drosophila_pseudoobscura, echinops_telfairi, erinaceus_europaeus, equus_caballus, felis_catus, ficedula_albicollis, gallus_gallus, gasterosteus_aculeatus, gorilla_gorilla, homo_sapiens, ixodes_scappularis, loxodonta_africana, macaca_mulatta, monodelphis_domestica, mus_musculus, myotis_lucifugus, ochotona_princeps, oryctolagus_cuniculus, oryzias_latipes, pan_troglodytes, pongo_abelii, rattus_norvegicus, saccharomyces_cerevisiae, ictidomys_tridecemlineatus, sus_scrofa, taeniopygia_guttata, takifugu_rubripes, tupaia_belangeri, xenopus_tropicalis,phaeodactylum_tricornutum,thalassiosira_pseudonana, lepisosteus_oculatus +name = EntrezGene +order = 10 +priority = 1 +parser = EntrezGeneParser + +[source EntrezGene_trans_name] +# Used by homo_sapiens,mus_musculus,danio_rerio,sus_scrofa +name = EntrezGene_trans_name +order = 70 +priority = 1 +parser = comes via official naming + +[source WikiGene::MULTI] +# used via the EntrezGeneParser, for all species +name = WikiGene +order = 100 +priority = 1 +parser = EntrezGeneParser + +[source HPA::homo_sapiens] +# Used by homo_sapiens +name = HPA +order = 50 +priority = 1 +prio_descr = Human Protein Atlas (HPA) database +parser = HPAParser + +[source LRG_HGNC_notransfer] +name = LRG_HGNC_notransfer +order = 30 +priority = 5 +parser = HGNCParser + +[source VGNC::vertebrate] +name = VGNC +order = 29 +priority = 1 +parser = VGNCParser + +[source HGNC::homo_sapiens#07] +# used by #02 +name = HGNC +order = 30 +priority = 1 +prio_descr = ensembl_manual +parser = HGNCParser + +[source HGNC::homo_sapiens#01] +# Used by homo_sapiens +name = HGNC +order = 30 +priority = 2 +prio_descr = ccds +parser = HGNCParser + + +[source HGNC::homo_sapiens#02] +# Used by homo_sapiens +name = HGNC +order = 29 +priority = 4 +prio_descr = entrezgene_manual +parser = HGNCParser +dependent_on = EntrezGene,Uniprot/SWISSPROT,RefSeq_dna,RefSeq_peptide + + +[source HGNC::homo_sapiens#03] +# Used by #02 +name = HGNC +order = 30 +priority = 5 +prio_descr = refseq_manual +parser = HGNCParser + +[source HGNC::homo_sapiens#08] +# used by #02 +name = HGNC +order = 30 +priority = 100 +prio_descr = desc_only +parser = HGNCParser + +[source MIM_GENE::homo_sapiens] +# MIM parse loads data as MIM_GENE or MIM_MORBID not as MIM +name = MIM_GENE +order = 40 +priority = 1 +parser = MIMParser + +[source MIM_MORBID::homo_sapiens] +# MIM parse loads data as MIM_GENE or MIM_MORBID not as MIM +name = MIM_MORBID +order = 40 +priority = 1 +parser = MIMParser + +[source MIM::homo_sapiens] +# Used by homo_sapiens +name = MIM +order = 
10 +priority = 1 +parser = MIMParser + +[source MIM2GENE::homo_sapiens] +# Used by homo_sapiens +name = MIM2GENE +order = 60 +priority = 1 +parser = Mim2GeneParser +dependent_on = MIM,EntrezGene + +[source GeneCards::homo_sapiens] +# used via the HGNCParser, for homo_sapiens +name = GeneCards +order = 100 +priority = 1 +parser = HGNCParser + +[source MGI::mus_musculus#01] +# Used by mus_musculus +name = MGI +order = 30 +priority = 1 +prio_descr = official +parser = MGIParser + +[source MGI::mus_musculus#05] +# Used by mus_musculus +name = MGI +order = 1 +priority = 10 +prio_descr = descriptions +parser = MGI_Desc_Parser + +[source Reactome::MULTI] +# Used by all species +name = Reactome +order = 80 +priority = 1 +prio_descr = direct +parser = ReactomeParser + +[source Reactome_transcript::MULTI] +# Species source used in ReactomeParser. No species uses this source +name = Reactome_transcript +order = 20 +priority = 1 +prio_descr = transcript +parser = ReactomeParser + +[source Reactome_gene::MULTI] +# Species source used in ReactomeParser. No species uses this source +name = Reactome_gene +order = 20 +priority = 1 +prio_descr = gene +parser = ReactomeParser + +[source Reactome::MULTI-Uniprot] +# Special source used in ReactomeParser. No species uses this source. +name = Reactome +order = 20 +priority = 1 +prio_descr = uniprot +parser = ReactomeParser + +[source RGD::rattus_norvegicus] +# Used by rattus_norvegicus +name = RGD +order = 30 +priority = 2 +parser = RGDParser +dependent_on = RefSeq_dna,RefSeq_peptide + +[source RGD::rattus_norvegicus#02] +# Used by rattus_norvegicus +name = RGD +order = 30 +priority = 1 +prio_descr = direct_xref +parser = done_in_RGDParser + +[source RGD_trans_name] +name = RGD_trans_name +order = 49 +priority = 1 +parser = done_in_official_naming + +[source RefSeq_dna::MULTI-vertebrate] +# Used by vertebrates +name = RefSeq_dna +order = 15 +priority = 2 +prio_descr = refseq +parser = RefSeqDatabaseParser +old_parser = RefSeqGPFFParser + +[source RefSeq_dna::gencode] +# Used by human and mouse +name = RefSeq_dna +order = 15 +priority = 2 +prio_descr = refseq +parser = RefSeqGPFFParser + +[source RefSeq_dna::MULTI-fungi] +# Used by saccharomyces_cerevisiae +name = RefSeq_dna +order = 15 +priority = 2 +prio_descr = refseq +parser = RefSeqParser + +[source RefSeq_dna::MULTI-Plants] +name = RefSeq_dna +order = 15 +priority = 2 +prio_descr = refseq +parser = RefSeqDatabaseParser +old_parser = RefSeqGPFFParser + +[source RefSeq_dna::MULTI-complete] +# Used by phaeodactylum_tricornutum +name = RefSeq_dna +order = 15 +priority = 2 +prio_descr = refseq +parser = RefSeqParser + +[source RefSeq_dna::MULTI-protozoa] +# Used by dictyostelium_discoideum +name = RefSeq_dna +order = 15 +priority = 2 +prio_descr = refseq +parser = RefSeqParser + +[source RefSeq_dna::MULTI-invertebrate] +# Used by drosophila_melanogaster +name = RefSeq_dna +order = 15 +priority = 1 +prio_descr = refseq +parser = RefSeqParser + +[source RefSeq_dna::MULTI-predicted] +# Special source used in RefSeqParser. No species uses this source. +name = RefSeq_dna_predicted +order = 20 +priority = 1 +prio_descr = refseq +parser = RefSeqParser + +[source RefSeq_mRNA::MULTI] +# Special source used in RefSeqParser. No species uses this source. +# dependent source is used to provide a link between RefSeq_dna source in SubmitMapper +name = RefSeq_mRNA +order = 15 +priority = 3 +prio_descr = refseq +parser = RefSeqParser + +[source RefSeq_mRNA::otherfeatures] +# Special source used in RefSeqCoordinateParser. 
No species uses this source. +name = RefSeq_mRNA +order = 15 +priority = 1 +prio_descr = otherfeatures +parser = RefSeqCoordinateParser + +[source RefSeq_peptide::otherfeatures] +# Special source used in RefSeqCoordinateParser. No species uses this source. +name = RefSeq_peptide +order = 15 +priority = 1 +prio_descr = otherfeatures +parser = RefSeqCoordinateParser + +[source RefSeq_mRNA_predicted::otherfeatures] +# Special source used in RefSeqCoordinateParser. No species uses this source. +name = RefSeq_mRNA_predicted +order = 20 +priority = 1 +prio_descr = otherfeatures +parser = RefSeqCoordinateParser + +[source RefSeq_peptide_predicted::otherfeatures] +# Special source used in RefSeqCoordinateParser. No species uses this source. +name = RefSeq_peptide_predicted +order = 20 +priority = 1 +prio_descr = otherfeatures +parser = RefSeqCoordinateParser + +[source RefSeq_import::otherfeatures] +# Import RefSeq models from otherfeatures database +# Used for human and mouse +name = RefSeq_import +order = 20 +priority = 1 +prio_descr = otherfeatures +parser = RefSeqCoordinateParser + +[source RefSeq_ncRNA::MULTI] +# Special source used in RefSeqParser. No species uses this source. +name = RefSeq_ncRNA +order = 15 +priority = 2 +prio_descr = refseq +parser = RefSeqParser + +[source RefSeq_ncRNA::otherfeatures] +# Special source used in RefSeqCoordinateParser. No species uses this source. +name = RefSeq_ncRNA +order = 15 +priority = 1 +prio_descr = otherfeatures +parser = RefSeqCoordinateParser + +[source RefSeq_ncRNA_predicted::otherfeatures] +# Special source used in RefSeqCoordinateParser. No species uses this source. +name = RefSeq_ncRNA_predicted +order = 20 +priority = 1 +prio_descr = otherfeatures +parser = RefSeqCoordinateParser + +[source RefSeq_mRNA_predicted::MULTI] +# Special source used in RefSeqParser. No species uses this source. +name = RefSeq_mRNA_predicted +order = 20 +priority = 2 +prio_descr = refseq +parser = RefSeqParser + +[source RefSeq_ncRNA_predicted::MULTI] +# Special source used in RefSeqParser. No species uses this source. +name = RefSeq_ncRNA_predicted +order = 20 +priority = 1 +prio_descr = refseq +parser = RefSeqParser + +[source RefSeq_peptide::MULTI] +name = RefSeq_peptide +order = 30 +priority = 2 +parser = RefSeqGPFFParser + +[source RefSeq_peptide::gencode] +name = RefSeq_peptide +order = 30 +priority = 2 +parser = RefSeqGPFFParser + +[source RefSeq_peptide::MULTI-fungi] +# Used by saccharomyces_cerevisiae +name = RefSeq_peptide +order = 25 +priority = 2 +parser = RefSeqGPFFParser + +[source RefSeq_peptide::MULTI-Plants] +name = RefSeq_peptide +order = 25 +priority = 2 +parser = RefSeqGPFFParser + +[source RefSeq_peptide::MULTI-complete] +# Used by phaeodactylum_tricornutum +name = RefSeq_peptide +order = 25 +priority = 2 +parser = RefSeqGPFFParser + +[source RefSeq_peptide::MULTI-protozoa] +# Used by dictyostelium_discoideum +name = RefSeq_peptide +order = 25 +priority = 2 +parser = RefSeqGPFFParser + +[source RefSeq_peptide::MULTI-invertebrate] +# Used by caenorhabditis_elegans, ciona_savignyi, drosophila_melanogaster +name = RefSeq_peptide +order = 25 +priority = 2 +parser = RefSeqGPFFParser + +[source RefSeq_peptide_predicted::MULTI] +# Special source used in RefSeqGPFFParser. No species uses this source. 
+name = RefSeq_peptide_predicted +order = 30 +priority = 2 +prio_descr = refseq +parser = RefSeqGPFFParser + +[source RefSeq_peptide::MULTI-vertebrate] +# Used by vertebrates +name = RefSeq_peptide +order = 25 +priority = 2 +prio_descr = refseq +parser = RefSeqDatabaseParser +old_parser = RefSeqGPFFParser + +[source SGD_GENE::saccharomyces_cerevisiae] +# Used by saccharomyces_cerevisiae +name = SGD_GENE +order = 10 +priority = 1 +parser = SGDParser + +[source SGD_TRANSLATION::saccharomyces_cerevisiae] +# Used by saccharomyces_cerevisiae +name = SGD_TRANSLATION +order = 10 +priority = 1 +parser = SGDParser + +[source SGD::saccharomyces_cerevisiae] +# Used by saccharomyces_cerevisiae +name = SGD +order = 10 +priority = 1 +parser = SGDParser + +[source PomBase_GENE::schizosaccharomyces_pombe] +# Used by schizosaccharomyces_pombe +name = PomBase_GENE +order = 10 +priority = 1 +parser = PomBaseParser + +[source PomBase_TRANSCRIPT::schizosaccharomyces_pombe] +# Used by schizosaccharomyces_pombe +name = PomBase_TRANSCRIPT +order = 10 +priority = 1 +parser = PomBaseParser + +[source PomBase::schizosaccharomyces_pombe] +# Used by schizosaccharomyces_pombe +name = PomBase +order = 10 +priority = 1 +parser = PomBaseParser + +[source PGSC_GENE::solanum_tuberosum] +# Used by solanum_tuberosum +name = PGSC_GENE +order = 10 +priority = 1 +parser = PGSCParser + +[source PHYTOZOME_GMAX_GENE::glycine_max] +# Used by glycine_max +name = PHYTOZOME_GMAX_GENE +order = 10 +priority = 1 +parser = PhytozomeGmaxParser + +[source UCSC::MULTI] +# Special source used in UCSCParser. No species uses this source. +name = UCSC +order = 70 +priority = 1 +prio_descr = +parser = UCSCParser + +[source UCSC::homo_sapiens] +# Used by homo_sapiens +name = UCSC_hg38 +order = 70 +priority = 1 +parser = UCSC_human_parser + +[source UCSC::mus_musculus] +# Used by mus_musculus +name = UCSC_mm10 +order = 70 +priority = 1 +parser = UCSC_mouse_parser + +[source Uniprot/SPTREMBL::MULTI-invertebrate] +name = Uniprot/SPTREMBL +order = 20 +priority = 3 +parser = UniProtParser +dependent_on = MIM + +[source Uniprot/SPTREMBL::MULTI] +# Used by vertebrates +name = Uniprot/SPTREMBL +order = 20 +priority = 3 +prio_descr = sequence_mapped +parser = UniProtDatabaseParser +old_parser = UniProtParser +dependent_on = MIM + +[source Uniprot/SPTREMBL::gencode] +# Used by human and mouse +name = Uniprot/SPTREMBL +order = 20 +priority = 3 +prio_descr = sequence_mapped +parser = UniProtParser +dependent_on = MIM + +[source Uniprot/SPTREMBL::MULTI-evidence_gt_2] +# Additional source for entries with evidence at protein level > 2 (numerically) for Uniprot/SPTREMBL::MULTI +# These are not taken into account when deriving display xrefs or assigning gene status +name = Uniprot/SPTREMBL +order = 20 +priority = 10 +prio_descr = protein_evidence_gt_2 +parser = UniProtParser +status = LOWEVIDENCE + +[source Uniprot/SWISSPROT::MULTI] +# Used by vertebrates +name = Uniprot/SWISSPROT +order = 20 +priority = 3 +prio_descr = sequence_mapped +parser = UniProtDatabaseParser +old_parser = UniProtParser +dependent_on = MIM + +[source Uniprot/SWISSPROT::gencode] +# Used by human and mouse +name = Uniprot/SWISSPROT +order = 20 +priority = 3 +prio_descr = sequence_mapped +parser = UniProtParser +dependent_on = MIM + +[source Uniprot/SWISSPROT::MULTI-invertebrate] +name = Uniprot/SWISSPROT +order = 20 +priority = 3 +prio_descr = sequence_mapped +parser = UniProtParser +dependent_on = MIM + +[source Uniprot/SWISSPROT::DIRECT] +# Special source used in UniProtParser for direct
mappings from Uniprot +name = Uniprot/SWISSPROT +order = 22 +priority = 1 +prio_descr = direct +parser = UniProtParser + +[source Uniprot/SPTREMBL::DIRECT] +# Special source used in UniProtParser for direct mappings from Uniprot +name = Uniprot/SPTREMBL +order = 22 +priority = 1 +prio_descr = direct +parser = UniProtParser + +[source Uniprot_gn] +# Special source used in UniProtParser for gene names. +name = Uniprot_gn +order = 20 +priority = 1 +parser = UniProtParser + +[source Uniprot_isoform] +# Special source used in UniProtParser for protein isoforms +name = Uniprot_isoform +order = 30 +priority = 1 +parser = UniProtParser + +[source UniProt::protein_id] +# Special source used in UniProtParser. No species uses this source. +name = protein_id +order = 20 +priority = 1 +parser = UniProtParser + +[source UniProt::PDB] +# Special source used in UniProtParser. No species uses this source. +name = PDB +order = 20 +priority = 1 +parser = UniProtParser + +[source UniProt::MEROPS] +# Special source used in UniProtParser. No species uses this source. +name = MEROPS +order = 20 +priority = 1 +parser = UniProtParser + +[source UniProt::EMBL] +# Special source used in UniProtParser. No species uses this source. +name = EMBL +order = 20 +priority = 1 +parser = UniProtParser + +[source UniProt::ChEMBL] +# Special source used in UniProtParser. No species uses this source. +name = ChEMBL +order = 20 +priority = 1 +parser = UniProtParser + +[source UniParc::MULTI] +name = UniParc +order = 20 +priority = 1 +parser = ChecksumParser + +[source RNACentral::MULTI] +name = RNAcentral +order = 1 +priority = 1 +parser = ChecksumParser + +[source PIGGY_trans_name] +name = PIGGY_trans_name +order = 49 +priority = 1 +parser = done_in_official_naming + +[source HGNC_trans_name] +name = HGNC_trans_name +order = 49 +priority = 1 +parser = done_in_official_naming + +[source VGNC_trans_name] +name = VGNC_trans_name +order = 49 +priority = 1 +parser = done_in_official_naming + +[source MGI_automatic_transcript::mus_musculus] +name = MGI_automatic_transcript_notransfer +order = 49 +priority = 1 +parser = done_in_official_naming + +[source MGI_trans_name] +# Used by homo_sapiens, mus_musculus +name = MGI_trans_name +order = 70 +priority = 1 +parser = comes via official naming + +[source Clone_based_ensembl_transcript::homo_sapiens] +name = Clone_based_ensembl_transcript +order = 50 +priority = 1 +parser = done_in_official_naming + +[source Clone_based_ensembl_gene::homo_sapiens] +name = Clone_based_ensembl_gene +order = 50 +priority = 1 +parser = done_in_official_naming + +[source Xenopus_Jamboree::xenopus_tropicalis] +# Used by xenopus_tropicalis +name = Xenbase +order = 20 +priority = 1 +parser = XenopusJamboreeParser + +[source ZFIN_ID::danio_rerio#01] +# Used by danio_rerio +name = ZFIN_ID +order = 31 +priority = 1 +prio_descr = uniprot/refseq +parser = ZFINParser + +[source ZFIN_ID::danio_rerio#03] +# Used by danio_rerio +name = ZFIN_ID +order = 1 +priority = 10 +prio_descr = description_only +parser = ZFINDescParser + +[source ZFIN_ID_trans_name] +name = ZFIN_ID_trans_name +order = 49 +priority = 1 +parser = done_in_official_naming + +[source cint_jgi_v1::ciona_intestinalis] +# Used by ciona_intestinalis +name = cint_jgi_v1 +order = 50 +priority = 1 +parser = JGI_ProteinParser + +[source RFAM::MULTI] +# Used by bos_taurus, canis_familiaris, ciona_intestinalis, ciona_savignyi, danio_rerio, dasypus_novemcinctus, drosophila_pseudoobscura, erinaceus_europaeus, ficedula_albicollis, gallus_gallus, gasterosteus_aculeatus,
homo_sapiens, loxodonta_africana, macaca_mulatta, monodelphis_domestica, mus_musculus, myotis_lucifugus, oryctolagus_cuniculus, oryzias_latipes, pan_troglodytes, pongo_abelii, rattus_norvegicus, ictidomys_tridecemlineatus, takifugu_rubripes, tupaia_belangeri, xenopus_tropicalis, ornithorhynchus_anatinus +name = RFAM +order = 70 +priority = 1 +parser = RFAMParser + +[source RFAM::EG] +name = RFAM +order = 70 +priority = 1 +parser = CoreXrefParser + + +[source miRBase::MULTI] +# Used by bos_taurus, canis_familiaris, ciona_intestinalis, ciona_savignyi, danio_rerio, dasypus_novemcinctus, erinaceus_europaeus, ficedula_albicollis, gallus_gallus, gasterosteus_aculeatus, homo_sapiens, loxodonta_africana, macaca_mulatta, monodelphis_domestica, mus_musculus, myotis_lucifugus, oryctolagus_cuniculus, oryzias_latipes, pan_troglodytes, pongo_abelii, rattus_norvegicus, ictidomys_tridecemlineatus, takifugu_rubripes, tupaia_belangeri, xenopus_tropicalis, ornithorhynchus_anatinus +name = miRBase +order = 70 +priority = 1 +parser = miRBaseParser + +[source miRBase_trans_name] +# Used by homo_sapiens, mus_musculus +name = miRBase_trans_name +order = 70 +priority = 1 +parser = comes via official naming + +[source RFAM_trans_name] +# Used by homo_sapiens, mus_musculus +name = RFAM_trans_name +order = 70 +priority = 1 +parser = comes via official naming + +[source Uniprot_gn_trans_name] +# Used by merged species: homo_sapiens, mus_musculus, danio_rerio and sus_scrofa +name = Uniprot_gn_trans_name +order = 70 +priority = 1 +parser = comes via official naming + +[source RNAMMER::MULTI] +# Used by EnsemblGenomes, e.g. aspergillus_clavatus, aspergillus_flavus, aspergillus_fumigatus, aspergillus_nidulans, aspergillus_niger, aspergillus_oryzae, aspergillus_terreus, neosartorya_fischeri +name = RNAMMER +order = 70 +priority = 1 +parser = CoreXrefParser + +[source TRNASCAN_SE::MULTI] +# Used by EnsemblGenomes, e.g. aspergillus_clavatus, aspergillus_flavus, aspergillus_fumigatus, aspergillus_nidulans, aspergillus_niger, aspergillus_oryzae, aspergillus_terreus, neosartorya_fischeri +name = TRNASCAN_SE +order = 70 +priority = 1 +parser = CoreXrefParser + +[source ncRNA_EG::EG] +# Used by EnsemblGenomes, e.g. aspergillus_clavatus, aspergillus_flavus, aspergillus_fumigatus, aspergillus_nidulans, aspergillus_niger, aspergillus_oryzae, aspergillus_terreus, neosartorya_fischeri +# replaces TRNASCAN_SE, RNAMMER and RFAM::EG +name = ncRNA_EG +order = 70 +priority = 1 +parser = CoreXrefParser + +[source misc_EG::EG] +# Used by EnsemblGenomes to maintain sources of xrefs which don't have a proper parser yet.
+name = misc_EG +order = 70 +priority = 1 +parser = EG_DBParser + +[source ENA_GENE::MULTI] +# Used by aspergillus_niger +name = ENA_GENE +order = 70 +priority = 1 + +[source CADRE::MULTI] +# Used by aspergillus_clavatus, aspergillus_flavus, aspergillus_fumigatus, aspergillus_nidulans, aspergillus_niger, aspergillus_oryzae, aspergillus_terreus, neosartorya_fischeri +name = CADRE +order = 70 +priority = 1 + +[source CADRE_AFum_A1163::MULTI] +# Used by aspergillus_fumigatusa1163 +name = CADRE_Afum_A1163 +order = 70 +priority = 1 + +[source AspGD::MULTI] +# Used by aspergillus_nidulans +name = AspGD +order = 70 +priority = 1 + +[source GeneDB::MULTI] +# Used by plasmodium_falciparum, trypanosoma_brucei, leishmania_major, schistosoma_mansoni +name = GeneDB +order = 70 +priority = 1 + +[source phatr_jgi_v2::MULTI] +# Used by phaeodactylum_tricornutum (Pt) +name = phatr_jgi_v2 +order = 70 +priority = 1 + +[source phatr_jgi_v2_bd::MULTI] +# Used by phaeodactylum_tricornutum (Pt) +name = phatr_jgi_v2_bd +order = 70 +priority = 1 + +[source thaps_jgi_v2::MULTI] +# Used by thalassiosira_pseudonana (Tp) +name = thaps_jgi_v2 +order = 70 +priority = 1 + +[source physo1_jgi_v1.1_gene::MULTI] +# Used by phytophthora_sojae +name = physo1_jgi_v1.1_gene +order = 70 +priority = 1 + +[source PGD_GENE::MULTI] +# Used by Tp +name = PGD_GENE +order = 70 +priority = 1 + +[source physo1_jgi_v1.1::MULTI] +# Used by phytophthora_sojae +name = physo1_jgi_v1.1 +order = 70 +priority = 1 + +[source phyra_jgi_v1.1::MULTI] +# Used by phytophthora_ramorum +name = phyra_jgi_v1.1 +order = 70 +priority = 1 + +[source BROAD_P_infestans::MULTI] +# Used by phytophthora_infestans +name = BROAD_P_infestans +order = 70 +priority = 1 + +[source thaps_jgi_v2_bd::MULTI] +# Used by thalassiosira_pseudonana (Tp) +name = thaps_jgi_v2_bd +order = 70 +priority = 1 + + +[source BROAD_U_maydis::MULTI] +# Used by ustilago_maydis +name = BROAD_U_maydis +order = 70 +priority = 1 + +[source BROAD_F_oxysporum::MULTI] +# Used by fusarium_oxysporum +name = BROAD_F_oxysporum +order = 70 +priority = 1 + +[source BROAD_g_zeae::MULTI] +# Used by gibberella_zeae +name = BROAD_G_zeae +order = 70 +priority = 1 + +[source BROAD_G_moniliformis::MULTI] +# Used by gibberella_moniliformis +name = BROAD_G_moniliformis +order = 70 +priority = 1 + +[source SCHISTODB::MULTI] +# Used by schistosoma_mansoni +name = SCHISTODB +order = 70 +priority = 1 + +[source triad_jgi_v1.0::MULTI] +# Used by trichoplax_adhaerens +name = triad_jgi_v1.0 +order = 70 +priority = 1 + +[source wormbase::celegans] +name = wormbase_all +order = 50 +priority = 1 +parser = WormbaseDirectParser + +[source wormbase::cbriggsae] +name = wormbase_all +order = 50 +priority = 1 +parser = WormbaseDirectParser + +[source wormbase::cbrenneri] +name = wormbase_all +order = 50 +priority = 1 +parser = WormbaseDirectParser + +[source wormbase::cremanei] +name = wormbase_all +order = 50 +priority = 1 +parser = WormbaseDirectParser + +[source wormbase::cjaponica] +name = wormbase_all +order = 50 +priority = 1 +parser = WormbaseDirectParser + +[source wormbase::ppacificus] +name = wormbase_all +order = 50 +priority = 1 +parser = WormbaseDirectParser + +[source wormbase::sratti] +name = wormbase_all +order = 50 +priority = 1 +parser = WormbaseDirectParser + + +[source wormbase::bmalayi] +name = wormbase_all +order = 50 +priority = 1 +parser = WormbaseDirectParser + +[source wormbase::ovolvulus] +name = wormbase_all +order = 50 +priority = 1 +parser = WormbaseDirectParser + +[source wormbase::tmuris] +name = wormbase_all +order = 50 +priority = 1 +parser = WormbaseDirectParser + +[source wormpep_id::wormbase] +# Used by wormbase core species +name = wormpep_id
+order = 50 +priority = 1 +parser = comes from WormbaseDirectParser + +[source wormbase_gene::wormbase] +# Used by wormbase core species +name = wormbase_gene +order = 50 +priority = 1 +parser = comes from WormbaseDirectParser + +[source wormbase_locus::wormbase] +# Used by wormbase core species +name = wormbase_locus +order = 50 +priority = 1 +parser = comes from WormbaseDirectParser + +[source wormbase_gseqname::wormbase] +# Used by wormbase core species +name = wormbase_gseqname +order = 50 +priority = 1 +parser = comes from WormbaseDirectParser + +[source wormbase_transcript::wormbase] +# Used by wormbase core species +name = wormbase_transcript +order = 50 +priority = 1 +parser = comes from WormbaseDirectParser + +[source wormbase_cds::wormbase] +# Used by wormbase core species +name = wormbase_cds +order = 50 +priority = 1 +parser = comes from WormbaseDirectParser + +[source Gramene_Pathway::arabidopsis_thaliana] +# Used by Arabidopsis thaliana, Gramene-specific +name = Gramene_Pathway +order = 50 +priority = 1 +parser = GramenePathwayParser + +[source Gramene_Pathway::brachypodium_distachyon] +# Used by Brachypodium distachyon, Gramene-specific +name = Gramene_Pathway +order = 50 +priority = 1 +parser = GramenePathwayParser + +[source Gramene_Pathway::solanum_lycopersicum] +# Used by Tomato, Gramene-specific +name = Gramene_Pathway +order = 50 +priority = 1 +parser = GramenePathwayParser + +[source Gramene_Pathway::zea_mays] +# Used by Zea mays, Gramene-specific +name = Gramene_Pathway +order = 50 +priority = 1 +parser = GramenePathwayParser + +[source Gramene_Pathway::populus_trichocarpa] +# Used by Poplar, Gramene-specific +name = Gramene_Pathway +order = 50 +priority = 1 +parser = GramenePathwayParser + +[source Gramene_Pathway::solanum_tuberosum] +# Used by Solanum tuberosum, Gramene-specific +name = Gramene_Pathway +order = 50 +priority = 1 +parser = GramenePathwayParser + +[source Gramene_Pathway::oryza_sativa] +# Used by Oryza sativa, Gramene-specific +name = Gramene_Pathway +order = 50 +priority = 1 +parser = GramenePathwayParser + +[source Gramene_Pathway::sorghum_bicolor] +# Used by Sorghum bicolor, Gramene-specific +name = Gramene_Pathway +order = 50 +priority = 1 +parser = GramenePathwayParser + + +[source PO_GROW::arabidopsis_thaliana] +# Used by Arabidopsis thaliana, Gramene-specific +name = PO +order = 85 +priority = 1 +prio_descr = main +dependent_on = TAIR_TRANSLATION +parser = TAIROntologyParser + +[source PO_STRU::arabidopsis_thaliana] +# Used by Arabidopsis thaliana, Gramene-specific +name = PO +order = 85 +priority = 1 +prio_descr = main +dependent_on = TAIR_TRANSLATION +parser = TAIROntologyParser + +[source TAIR_LOCUS::arabidopsis_thaliana] +# Used by arabidopsis_thaliana +name = TAIR_LOCUS +order = 1 +priority = 1 +parser = TAIRIDParser + +[source TAIR_LOCUS_MODEL::arabidopsis_thaliana] +# Used by arabidopsis_thaliana +name = TAIR_LOCUS_MODEL +order = 1 +priority = 1 +parser = TAIRIDParser + +[source TAIR_SYMBOL::arabidopsis_thaliana] +# Used by arabidopsis_thaliana +name = TAIR_SYMBOL +order = 1 +priority = 1 +parser = TAIRIDParser + +[source TAIR_TRANSLATION::arabidopsis_thaliana] +# Used by arabidopsis_thaliana +name = TAIR_TRANSLATION +order = 1 +priority = 1 + +[source NASC_GENE_ID::arabidopsis_thaliana] +# Used by arabidopsis_thaliana +name = NASC_GENE_ID +order = 1 +priority = 1 +parser = TAIRIDParser + +[source CommunityGO::arabidopsis_thaliana] +# Used by Arabidopsis thaliana, Gramene-specific +name = GO +order = 85 +priority = 1 +prio_descr = main 
+dependent_on = TAIR_TRANSLATION +parser = TAIROntologyParser + + + + +######################################################################## +# SPECIES # +# # +# Keys: # +# taxonomy_id - taxonomy ID of species/strain # +# (multiple comma separated, required) # +# sources - sources used for this species/strain # +# (multiple comma separated, required) # +# # +######################################################################## + +######################################################################## +# VERTEBRATES # +# # +# Default sources for vertebrates # +# Additional configuration for species-specific sources # +# # +######################################################################## + + +[species sars_cov_2] +taxonomy_id = 2697049 +sources = RefSeq_peptide::MULTI,EntrezGene::MULTI,Uniprot/SWISSPROT::MULTI + +[species vertebrates] +taxonomy_id = 7742 +sources = EntrezGene::MULTI,Reactome::MULTI,RNACentral::MULTI,RefSeq_dna::MULTI-vertebrate,RefSeq_peptide::MULTI-vertebrate,RefSeq_import::otherfeatures,Uniprot/SPTREMBL::MULTI,Uniprot/SWISSPROT::MULTI,UniParc::MULTI,RFAM::MULTI,miRBase::MULTI,ArrayExpress::MULTI,VGNC::vertebrate + +[species homo_sapiens] +taxonomy_id = 9606 +sources = CCDS::homo_sapiens,DBASS3::homo_sapiens,DBASS5::homo_sapiens,HPA::homo_sapiens,HGNC::homo_sapiens#02,MIM::homo_sapiens,MIM2GENE::homo_sapiens,UCSC::homo_sapiens,RefSeq_dna::gencode,RefSeq_peptide::gencode,Uniprot/SPTREMBL::gencode,Uniprot/SWISSPROT::gencode + +[species mus_musculus] +taxonomy_id = 10090 +sources = CCDS::mus_musculus,EntrezGene::MULTI,MGI::mus_musculus#01,MGI::mus_musculus#05,UCSC::mus_musculus,RefSeq_dna::gencode,RefSeq_peptide::gencode,Uniprot/SPTREMBL::gencode,Uniprot/SWISSPROT::gencode + +[species danio_rerio] +taxonomy_id = 7955 +sources = ZFIN_ID::danio_rerio#01,ZFIN_ID::danio_rerio#03 + +[species rattus_norvegicus] +taxonomy_id = 10116 +sources = RGD::rattus_norvegicus,RGD::rattus_norvegicus#02 + + + +[species ciona_intestinalis] +taxonomy_id = 7719 +sources = cint_jgi_v1::ciona_intestinalis + +[species xenopus_tropicalis] +taxonomy_id = 8364 +sources = Xenopus_Jamboree::xenopus_tropicalis + +######################################################################## +# METAZOA # +# # +# Default sources for metazoa # +# Additional configuration for species-specific sources # +# # +######################################################################## + +[species metazoa] +taxonomy_id = 33208 +sources = EntrezGene::MULTI,RefSeq_dna::MULTI-invertebrate,RefSeq_peptide::MULTI-invertebrate,Uniprot/SPTREMBL::MULTI-invertebrate,Uniprot/SWISSPROT::MULTI-invertebrate,UniParc::MULTI,ArrayExpress::EG,ncRNA_EG::EG,RNAMMER::MULTI,miRBase::MULTI,RFAM::EG,TRNASCAN_SE::MULTI,misc_EG::EG + +[species drosophila_melanogaster] +taxonomy_id = 7227 +sources = flybase_gene_id::drosophila_melanogaster + +[species drosophila_pseudoobscura] +taxonomy_id = 46245 +sources = flybase_gene_id::drosophila_pseudoobscura + +[species drosophila_ananassae] +taxonomy_id = 7217 +sources = flybase_gene_id::drosophila_ananassae + +[species drosophila_erecta] +taxonomy_id = 7220 +sources = flybase_gene_id::drosophila_erecta + +[species drosophila_grimshawi] +taxonomy_id = 7222 +sources = flybase_gene_id::drosophila_grimshawi + +[species drosophila_mojavensis] +taxonomy_id = 7230 +sources = flybase_gene_id::drosophila_mojavensis + +[species drosophila_persimilis] +taxonomy_id = 7234 +sources = flybase_gene_id::drosophila_persimilis + +[species drosophila_sechellia] +taxonomy_id = 7238 +sources = 
flybase_gene_id::drosophila_sechellia + +[species drosophila_simulans] +taxonomy_id = 7240 +sources = flybase_gene_id::drosophila_simulans + +[species drosophila_virilis] +taxonomy_id = 7244 +sources = flybase_gene_id::drosophila_virilis + +[species drosophila_willistoni] +taxonomy_id = 7260 +sources = flybase_gene_id::drosophila_willistoni + +[species drosophila_yakuba] +taxonomy_id = 7245 +sources = flybase_gene_id::drosophila_yakuba + +######################################################################## +# FUNGI # +# # +# Default sources for fungi # +# Additional configuration for species-specific sources # +# # +######################################################################## + +[species fungi] +taxonomy_id = 4751 +sources = EntrezGene::MULTI,RefSeq_dna::MULTI-fungi,RefSeq_peptide::MULTI-fungi,Uniprot/SPTREMBL::MULTI,Uniprot/SWISSPROT::MULTI,PHIbase::MULTI,ArrayExpress::EG,RFAM::EG,miRBase::MULTI,RNAMMER::MULTI,TRNASCAN_SE::MULTI,misc_EG::EG + +[species saccharomyces_cerevisiae] +taxonomy_id = 4932,559292 +sources = SGD::saccharomyces_cerevisiae + +[species schizosaccharomyces_pombe] +taxonomy_id = 4896,284812 +sources = PomBase::schizosaccharomyces_pombe + +######################################################################## +# PLANTS # +# # +# Default sources for plants # +# Additional configuration for species-specific sources # +# # +######################################################################## + +[species plants] +taxonomy_id = 33090 +sources = EntrezGene::MULTI,Reactome::MULTI,RNACentral::MULTI,RefSeq_dna::MULTI-Plants,RefSeq_import::otherfeatures,Uniprot/SPTREMBL::MULTI,Uniprot/SWISSPROT::MULTI,UniParc::MULTI,RFAM::MULTI,miRBase::MULTI,ArrayExpress::MULTI,ncRNA_EG::EG,misc_EG::EG + +[species glycine_max] +taxonomy_id = 3847 +sources = PHYTOZOME_GMAX_GENE::glycine_max + +[species solanum_lycopersicum] +taxonomy_id = 4081 +sources = Gramene_Pathway::solanum_lycopersicum + +[species solanum_tuberosum] +taxonomy_id = 4113 +sources = PGSC_GENE::solanum_tuberosum,Gramene_Pathway::solanum_tuberosum + +[species arabidopsis_thaliana] +taxonomy_id = 3702 +sources = Gramene_Pathway::arabidopsis_thaliana,TAIR_LOCUS::arabidopsis_thaliana,CommunityGO::arabidopsis_thaliana,PO_GROW::arabidopsis_thaliana,PO_STRU::arabidopsis_thaliana + +[species brachypodium_distachyon] +taxonomy_id = 15368 +sources = Gramene_Pathway::brachypodium_distachyon + +[species oryza_sativa] +taxonomy_id = 39947 +sources = Gramene_Pathway::oryza_sativa + +[species populus_trichocarpa] +taxonomy_id = 3694 +sources = Gramene_Pathway::populus_trichocarpa + +[species sorghum_bicolor] +taxonomy_id = 4558,91525,171959 +sources = Gramene_Pathway::sorghum_bicolor + +[species zea_mays] +taxonomy_id = 4577,112001,381124,334825,4579,76912 +sources = Gramene_Pathway::zea_mays + +[species caenorhabditis_elegans] +taxonomy_id = 6239 +sources = wormbase::celegans + +[species caenorhabditis_briggsae] +taxonomy_id = 6238 +sources = wormbase::cbriggsae + +[species caenorhabditis_remanei] +taxonomy_id = 31234 +sources = wormbase::cremanei + +[species caenorhabditis_brenneri] +taxonomy_id = 135651 +sources = wormbase::cbrenneri + +[species caenorhabditis_japonica] +taxonomy_id = 281687 +sources = wormbase::cjaponica + +[species brugia_malayi] +taxonomy_id = 6279 +sources = wormbase::bmalayi + +[species onchocerca_volvulus] +taxonomy_id = 6282 +sources = wormbase::ovolvulus + +[species pristionchus_pacificus] +taxonomy_id = 54126 +sources = wormbase::ppacificus + +[species strongyloides_ratti] +taxonomy_id = 34506 
+sources = wormbase::sratti + +[species trichuris_muris] +taxonomy_id = 70415 +sources = wormbase::tmuris + +######################################################################## +# PROTISTS                                                             # +#                                                                      # +# Default sources for protists                                         # +# Additional configuration for species-specific sources                # +#                                                                      # +######################################################################## + +[species protist] +taxonomy_id = 2759 +sources = EntrezGene::MULTI,RefSeq_dna::MULTI-complete,RefSeq_peptide::MULTI-complete,Uniprot/SPTREMBL::MULTI,Uniprot/SWISSPROT::MULTI,TRNASCAN_SE::MULTI,RNAMMER::MULTI,ArrayExpress::EG,PHIbase::MULTI,miRBase::MULTI,misc_EG::EG,RFAM::EG + diff --git a/src/python/scripts/genome_info.py b/src/python/scripts/genome_info.py new file mode 100755 index 000000000..48ade41b2 --- /dev/null +++ b/src/python/scripts/genome_info.py @@ -0,0 +1,95 @@ +#!/usr/bin/env python +""" +Copyright [1999-2015] Wellcome Trust Sanger Institute and the EMBL-European Bioinformatics Institute +Copyright [2016-2024] EMBL-European Bioinformatics Institute + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +""" + +''' +Fetch Genome Info from the new metadata API +''' + +import argparse +import logging +import sys +import json +import configparser +from os import getenv +from os.path import isdir +from os.path import join, isfile, realpath +from ensembl.production.metadata.api.genome import GenomeAdaptor + +logging.basicConfig(level=logging.INFO, format='%(message)s') +logger = logging.getLogger(__name__) + +def main(): + parser = argparse.ArgumentParser( + prog='genome_info.py', + description='Fetch Ensembl genome info from new metadata API' + ) + parser.add_argument('-g', '--genome_uuid', type=str, nargs='*', required=False, default=None, help='genome UUID, ex: a23663571,b236571') + parser.add_argument('-s', '--species', type=str, nargs='*', required=False, default=None, help='Ensembl species names, ex: homo_sapiens,mus_musculus') + parser.add_argument('-d', '--organism_group', type=str, nargs='*', required=False, default=None, help='organism group, ex: EnsemblVertebrates,EnsemblPlants') + parser.add_argument('-p', '--organism_group_type', type=str, nargs='*', required=False, default=None, help='organism group type, ex: Division') + parser.add_argument('-u', '--unreleased_genomes', help='Fetch only unreleased genome and datasets', action='store_true') + parser.add_argument('-n', '--dataset_name', type=str, nargs='*', required=False, default=None, help='ensembl dataset type to fetch unique genomes, ex: assembly, genebuild') + parser.add_argument('-r', '--dataset_source', type=str, nargs='*', required=False, default=None, help='ensembl dataset source, ex: homo_sapiens_core_111_38') + parser.add_argument('-m', '--metadata_db_uri', type=str, required=True, help='metadata db mysql uri, ex: mysql://ensro@localhost:3366/ensembl_genome_metadata') + parser.add_argument('-t', '--taxonomy_db_uri', type=str, required=True, help='taxonomy db mysql uri, ex: mysql://ensro@localhost:3366/ncbi_taxonomy') + parser.add_argument('-o', '--output',
type=str, required=True, help='output file, ex: genome_info.json') + + args = parser.parse_args() + + logger.info(args) + # default values + genome_uuid = args.genome_uuid + species = args.species + organism_group = args.organism_group + organism_group_type = args.organism_group_type + dataset_name = args.dataset_name + dataset_source = args.dataset_source + + + # required values + unreleased_genomes = args.unreleased_genomes + metadata_db_uri = args.metadata_db_uri + taxonomy_db_uri = args.taxonomy_db_uri + output_file_name = args.output + + genome_info_obj = GenomeAdaptor(metadata_uri=metadata_db_uri, taxonomy_uri=taxonomy_db_uri) + with open(output_file_name, 'w') as json_output: + for genome in genome_info_obj.fetch_genomes_info(genome_uuid=genome_uuid, + ensembl_name=species, + group=organism_group, + group_type=organism_group_type, + dataset_name=dataset_name, + dataset_source=dataset_source, + unreleased_genomes=unreleased_genomes) or []: + + genome_info = { + "genome_id" : genome[0]['genome'][0].genome_uuid, + "species" : genome[0]['genome'][1].ensembl_name, + "assembly" : genome[0]['genome'][2].assembly_default, + "assembly_name" : genome[0]['genome'][2].ensembl_name, + "assembly_accession" : genome[0]['genome'][2].accession, + "assembly_level" : genome[0]['genome'][2].level, + "division" : genome[0]['genome'][-1].name, + "database" : genome[0]['datasets'][-1][-1].name, + "database_type" : genome[0]['datasets'][-1][-1].type + } + json.dump(genome_info, json_output) + json_output.write("\n") + +if __name__ == '__main__': + main() diff --git a/src/python/scripts/run_module.py b/src/python/scripts/run_module.py new file mode 100644 index 000000000..874f02dd8 --- /dev/null +++ b/src/python/scripts/run_module.py @@ -0,0 +1,34 @@ +#!/usr/bin/env python + +# Copyright [1999-2015] Wellcome Trust Sanger Institute and the EMBL-European Bioinformatics Institute +# Copyright [2016-2024] EMBL-European Bioinformatics Institute +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import importlib + +from ensembl.common.Params import Params + +def main(): + params = Params() + + module_name = params.param_required('module') + class_name = module_name.split(".")[-1] + + module = importlib.import_module(module_name) + module_class = getattr(module, class_name) + module_instance = module_class() + + module_instance.run() + +if __name__ == '__main__': + main() diff --git a/travisci/kyotocabinet-perl-1.20.tar.gz b/travisci/kyotocabinet-perl-1.20.tar.gz new file mode 100644 index 000000000..77200c05b Binary files /dev/null and b/travisci/kyotocabinet-perl-1.20.tar.gz differ
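A note on the [source ...] stanzas in the config above: every stanza carries the same small key set (name, order, priority, prio_descr, parser), and several stanzas deliberately share one name so that alternative variants of the same external database can coexist. The sketch below shows one plausible way a consumer could pick a preferred variant per name. It is only an illustration: the file name xref_config.ini is assumed, the file is assumed to parse as plain INI, and lower priority numbers are assumed to take precedence (the priority = 10 on the LOWEVIDENCE and description_only variants points that way).

import configparser

def preferred_sources(path="xref_config.ini"):  # hypothetical file name
    # Values never contain "=", so a single delimiter keeps the "::" in
    # section names and source lists from being misread as key/value splits.
    cfg = configparser.ConfigParser(delimiters=("=",), strict=False)
    cfg.read(path)
    best = {}
    for section in cfg.sections():
        if not section.startswith("source "):
            continue  # skip the [species ...] stanzas
        name = cfg.get(section, "name", fallback=section)
        priority = cfg.getint(section, "priority", fallback=1)
        if name not in best or priority < best[name][0]:
            best[name] = (priority, section)  # assumed: lower number wins
    return {name: section for name, (priority, section) in best.items()}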
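Similarly, each [species ...] stanza maps one or more NCBI taxonomy IDs to the comma-separated source list that applies. Reusing the ConfigParser object from the sketch above, an exact-match lookup might look like the following; how the pipeline layers division-wide defaults (e.g. [species vertebrates] with taxonomy_id 7742) on top of species-specific stanzas is not visible in this file, so it is deliberately not modelled.

def sources_for_taxonomy(cfg, taxonomy_id):
    # Return the source list of every [species ...] stanza whose
    # taxonomy_id field names the given ID (exact match only).
    matches = []
    for section in cfg.sections():
        if not section.startswith("species "):
            continue
        ids = [t.strip() for t in cfg.get(section, "taxonomy_id", fallback="").split(",")]
        if str(taxonomy_id) in ids:
            matches.append([s.strip() for s in cfg.get(section, "sources", fallback="").split(",")])
    return matches

# e.g. sources_for_taxonomy(cfg, 559292) should match [species saccharomyces_cerevisiae]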
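On the output format of genome_info.py: the script calls json.dump once per genome and appends a newline after each object, so the output file is JSON Lines rather than a single JSON document, and consumers must read it line by line. A minimal reader sketch; the keys are the ones the script itself emits, and the file name just mirrors the --output help example.

import json

with open("genome_info.json") as fh:  # name taken from the --output example
    for line in fh:
        info = json.loads(line)
        print(info["genome_id"], info["species"], info["assembly_accession"])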
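run_module.py encodes a simple dispatch convention: the required 'module' parameter is a dotted path whose last component must also be the name of a class inside that module, and the class must be constructible with no arguments and expose a run() method. A minimal module satisfying that contract could look like the sketch below; the package path and class name are made up for illustration, and how ensembl.common.Params sources the 'module' value (command line or otherwise) is assumed rather than shown by this diff.

# src/python/ensembl/xrefs/DumpXrefs.py -- hypothetical path and class name
class DumpXrefs:
    # run_module.py instantiates this class with no arguments.
    def run(self):
        # A real module would do its work here.
        print("DumpXrefs.run() invoked via run_module.py")

# run_module.py would resolve 'module' = "ensembl.xrefs.DumpXrefs" (assumed to be
# supplied through Params), import it, and call DumpXrefs().run().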