Skip to content

Commit

Permalink
Merge branch 'dev'
Browse files Browse the repository at this point in the history
  • Loading branch information
ialbert committed Apr 10, 2024
2 parents 0403878 + a088adb commit 25039d3
Show file tree
Hide file tree
Showing 3 changed files with 81 additions and 2 deletions.
2 changes: 1 addition & 1 deletion Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -54,7 +54,7 @@ build:
# Publish the package
publish: test build
hatch publish

#REMOTE=www@biostarhandbook.com:/home/www/book/data_www/bio

# Upload prebuilt data to distribution site.
Expand Down
1 change: 0 additions & 1 deletion src/biorun/search.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,6 @@
ENA_FIELDS = f"{ENA_API}/returnFields"
ENA_REPORT = f"{ENA_API}/filereport"


# The assembly summary file
ASSEMBLY_SUMMARY_URL = 'https://ftp.ncbi.nlm.nih.gov/genomes/ASSEMBLY_REPORTS/assembly_summary_genbank.txt'
ASSEMBLY_SUMMARY_PATH = "assembly_summary_genbank.txt"
Expand Down
80 changes: 80 additions & 0 deletions src/biorun/search2.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,80 @@
"""
Bio search : search for information using accession numbers
"""
import re

# Sequence-read style accessions: one of the prefixes ERR, SRR, DRR, SRP or
# ERP followed by digits, e.g. SRR5260547.
SRR_PATT = re.compile(r'(ERR|SRR|DRR|SRP|ERP)\d+')
SRR_TYPE = 'srr'

# Bioproject numbers: "PRJ", one or more uppercase letters, then digits,
# e.g. PRJNA374918.
PRJ_PATT = re.compile(r'PRJ([A-Z])+\d+')
PRJ_TYPE = 'bioproject'

# Genbank accessions: letters, an optional underscore, digits and an optional
# ".version" suffix, e.g. NC_045512 or NP_001191.1.
# The named groups (letters, under, digits, version) expose each part.
GBK_PATT = re.compile(r'(?P<letters>[a-zA-Z]+)(?P<under>_?)(?P<digits>\d+)(\.(?P<version>\d+))?')
GBK_TYPE = 'genbank'

# Ensembl ids: "ENS", one or more uppercase letters, then digits,
# e.g. ENSG00000157764.
ENS_PATT = re.compile(r'ENS[A-Z]+\d+')
ENS_TYPE = 'ensembl'

# A pattern that matches a word followed by a colon and the literal word
# "gene" (case-insensitive), e.g. HAD3:gene. The word is captured as "word".
GENE_PATT = re.compile(r'(?P<word>\w+):gene', flags=re.IGNORECASE)
GENE_TYPE = 'gene'

# GEO accessions: GSM123456, GSE123456, GPL123456, GDS123456
GEO_PATT = re.compile(r'(GSM|GSE|GPL|GDS)\d+')
GEO_TYPE = 'geo'

# NCBI assembly ids: GCF_000001405.39, GCA_000001405.15
# The pattern itself covers only the GCA_/GCF_ prefix and the digits; a
# trailing ".version" is not part of the pattern.
ASM_PATT = re.compile(r'(GCA|GCF)_\d+')
ASM_TYPE = 'assembly'

# Table of (type label, compiled pattern) pairs, in lookup order.
# GBK_PATT accepts any letters followed by digits, which also covers the more
# specific prefixes above it; it is listed last, presumably so that it acts
# as the fallback for a first-match lookup.
PATTERNS = [
(ASM_TYPE, ASM_PATT),
(GEO_TYPE, GEO_PATT),
(GENE_TYPE, GENE_PATT),
(SRR_TYPE, SRR_PATT),
(PRJ_TYPE, PRJ_PATT),
(ENS_TYPE, ENS_PATT),
(GBK_TYPE, GBK_PATT),
]

# ENA API urls: the base endpoint and two paths derived from it.
ENA_API = "https://www.ebi.ac.uk/ena/portal/api"
ENA_FIELDS = f"{ENA_API}/returnFields"
ENA_REPORT = f"{ENA_API}/filereport"

def get_match(acc, patterns=None):
    """
    Classify an accession string by the first pattern that matches it.

    Each pattern is applied with ``match()``, which anchors at the start of
    the string only, so trailing characters (for example a ``.39`` version
    suffix) do not prevent a match.

    Args:
        acc: the accession text to classify.
        patterns: optional iterable of ``(type_name, compiled_pattern)``
            pairs, tried in order. Defaults to the module-level ``PATTERNS``.

    Returns:
        The type name paired with the first matching pattern, or ``None``
        when no pattern matches or ``acc`` is not a string.
    """
    # A non-string value (e.g. None) cannot be an accession; report "no match"
    # instead of letting the regex engine raise TypeError.
    if not isinstance(acc, str):
        return None

    if patterns is None:
        patterns = PATTERNS

    # Order matters: the first entry whose pattern matches wins.
    return next(
        (dtype for dtype, pattern in patterns if pattern.match(acc)),
        None,
    )

def run(acc):
    """
    Print the detected type of one accession, followed by a separator line.

    Prints ``# No match for: <acc>`` when ``get_match`` reports no type,
    otherwise prints the accession and its type separated by a space.
    A line of ten dashes is printed afterwards in both cases.
    """
    kind = get_match(acc)

    # Pick the fields to print; print() joins them with a single space.
    fields = (acc, kind) if kind else (f"# No match for: {acc}",)
    print(*fields)

    print("-" * 10)


if __name__ == '__main__':
    # Demo driver: classify a fixed set of sample identifiers, one per call
    # to run(). The last entry ("ecoli") has no digits and exercises the
    # "no match" branch.
    samples = (
        "NP_001191",
        "SRR5260547",
        "PRJNA374918",
        "HAD3:gene",
        "ENSG00000157764",
        "GSM123456",
        "GSE123456",
        "GCF_000001405.39",
        "GCA_000001405",
        "NP_001191.1",
        "ecoli",
    )
    for sample in samples:
        run(sample)


0 comments on commit 25039d3

Please sign in to comment.