-
Notifications
You must be signed in to change notification settings - Fork 5
/
Copy pathmultifastaloader.py
104 lines (76 loc) · 2.85 KB
/
multifastaloader.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
"""
This script calculates a couple of common bioinformatic metrics.
It takes a multifasta file as input and returns a pandas dataframe with the metrics.
In other words, it is a dataloading scripts that returns a fasta file with the sequences.
the file is called sequences.fasta and is saved in the updated_data folder.
"""
# Importing the libraries and modules that we are going to use to retrieve the data from NCBI
# Required python library: Biopython
# Required modules:
# 1.Entrez for retrieving sequences
# 2.SeqIO for exploring the sequences
# 3.os provides functions for interacting with the operating system
import os
import re
from Bio import Entrez
from Bio import SeqIO
# To use NCBI'S E-Utilities you must specify your email address with each request
# This function grabs your email from your .bashrc file
# it should look like this to work export email="johndoe@provider.com"
Entrez.email = os.getenv("EMAIL")
# change working directory:
os.chdir("updated_data")
def multifastaloader(*accession_numbers):
"""This function takes a list of accession numbers as input,
fetches the fasta sequences from NCBI and then outputs them in a multi fasta file ;sequences.fasta
Parameters
----------
*accession_numbers : a list of strings with accession numbers used in NCBI to retrieve sequences with those identifiers.
Returns
-------
type
Nonetype
A single multifasta file is returned to the working directory.
Example
-------
>>> multifastaloader("NC_000913.3", "NC_000913.3")
None
"""
try:
for i in accession_numbers:
query = Entrez.efetch(
db="nucleotide", id=i, rettype="fasta", retmode="text"
)
record = SeqIO.parse(
query, "fasta"
) # converting the query to a sequence record in biopython inorder to explore the sequences
# writing the query to a file
with open("sequences.fasta", "a", encoding="UTF-8") as multi_fasta:
SeqIO.write(record, multi_fasta, "fasta")
except UserWarning as error:
print(f"An error occurred: {error}")
def markdown_reader():
"""
This function reads a markdown file in data.md and returns a list of accession numbers
Parameters
----------
None
Returns
-------
list
A list of accession numbers
Example
-------
>>> markdown_reader()
['NC_000913.3', 'NC_000913.3']
"""
with open("data.md", "r", encoding="UTF-8") as file:
data = file.read()
accession_numbers = re.findall(r"[A-Z]{2}\d{6}\.\d{1}", data)
accession_numbers = list(set(accession_numbers))
return accession_numbers
def main():
"""runs the multifastaloader function and the markdown_reader function"""
multifastaloader(*markdown_reader())
if __name__ == "__main__":
main()