
Commit

added main.py
bsantan committed Jun 6, 2024
1 parent 3e3aedb commit 8e51b2e
Showing 1 changed file with 60 additions and 42 deletions.
102 changes: 60 additions & 42 deletions src/uniprot2s3/main.py
@@ -33,7 +33,9 @@
)
from .dummy_tqdm import DummyTqdm

ORGANISM_RESOURCE = "ncbitaxon_removed_subset.json"
# ORGANISM_RESOURCE = "ncbitaxon_removed_subset.json"
DISBIOME_ORGANISM_RESOURCE = "Disbiome_Microbe_Labels.csv"
PD_ORGANISM_RESOURCE = "PD_Microbe_Labels.csv"
EMPTY_ORGANISM_OUTFILE = RAW_DATA_DIR / "uniprot_empty_organism.tsv"


@@ -61,17 +63,21 @@ def get_organism_list(input_dir: Union[Path, str] = RAW_DATA_DIR) -> List[str]:
:return: List of organism IDs.
"""
# Read organism resource file and extract organism IDs
with open(Path(input_dir) / ORGANISM_RESOURCE, "r") as f:
contents = json.load(f)
ncbi_prefix = NCBITAXON_PREFIX.replace(":", "_")

# Create a list of organism IDs after filtering and cleaning
organism_list = [
i["id"].split(ncbi_prefix)[1]
for i in contents["graphs"][0]["nodes"]
if ncbi_prefix in i["id"] and i["id"].split(ncbi_prefix)[1].isdigit()
]
# Read disbiome organism resource file and extract organism IDs
with open(Path(input_dir) / DISBIOME_ORGANISM_RESOURCE, "r") as f:
organism_list = []
reader = csv.DictReader(f, delimiter='\t')
# Create a list of organism IDs after filtering and cleaning
for row in reader:
organism_list.append(row["entity_uri"].split(NCBITAXON_PREFIX)[1])

# Read PD organism resource file and extract organism IDs
with open(Path(input_dir) / PD_ORGANISM_RESOURCE, "r") as f:
reader = csv.DictReader(f, delimiter='\t')
# Create a list of organism IDs after filtering and cleaning
for row in reader:
organism_list.append(row["entity_uri"].split(NCBITAXON_PREFIX)[1])

# Update organism list based on existing empty request files
for file_path in [EMPTY_ORGANISM_OUTFILE]:
if file_path.is_file():
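
A minimal consolidated sketch of the organism-list change in this hunk, assuming both label files are tab-delimited with an "entity_uri" column whose values embed the numeric taxon ID after NCBITAXON_PREFIX (e.g. "NCBITaxon:"); the helper name read_taxon_ids is illustrative, not part of the module:

import csv
from pathlib import Path

def read_taxon_ids(path: Path, prefix: str = "NCBITaxon:") -> list:
    """Extract numeric NCBITaxon IDs from the entity_uri column of a label file."""
    ids = []
    with open(path, "r") as f:
        for row in csv.DictReader(f, delimiter="\t"):
            uri = row["entity_uri"]
            if prefix in uri:  # skip rows without a taxon URI
                ids.append(uri.split(prefix)[1])
    return ids

# Usage sketch: combine the Disbiome and PD label files into one organism list.
# organism_list = (read_taxon_ids(input_dir / "Disbiome_Microbe_Labels.csv")
#                  + read_taxon_ids(input_dir / "PD_Microbe_Labels.csv"))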
@@ -151,6 +157,8 @@ def fetch_uniprot_data(organism_id):
:param organism_id: The ID of the NCBITaxon entity.
"""
global UNIPROT_S3_DIR
UNIPROT_S3_DIR = Path(RAW_DATA_DIR).joinpath("s3")
file_path = UNIPROT_S3_DIR / f"{organism_id}.{UNIPROT_DESIRED_FORMAT}"
organism_query = TAXONOMY_ID_UNIPROT_PREFIX + organism_id
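
For illustration only: assuming RAW_DATA_DIR resolves to data/raw and UNIPROT_DESIRED_FORMAT is "tsv" (neither value is confirmed by this diff), the two added lines produce a per-organism output path like this:

from pathlib import Path

RAW_DATA_DIR = Path("data/raw")   # assumed value
UNIPROT_DESIRED_FORMAT = "tsv"    # assumed value
organism_id = "562"               # example NCBITaxon ID (E. coli)

UNIPROT_S3_DIR = Path(RAW_DATA_DIR).joinpath("s3")
file_path = UNIPROT_S3_DIR / f"{organism_id}.{UNIPROT_DESIRED_FORMAT}"
print(file_path)                  # data/raw/s3/562.tsv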

@@ -191,42 +199,53 @@ def fetch_uniprot_reference_proteome_data() -> list:
UNIPROT_SIZE,
)

try:
# Make the HTTP request to Uniprot
response = requests.get(url, timeout=30)
response.raise_for_status()
# Write response to file if it contains data
if len(response.text.strip().split("\n")) > 1:
with open(file_path, "w") as file:
file.write(response.text)

while "next" in response.links:
next_url = response.links["next"]["url"]
response = requests.get(next_url, timeout=30)
if not file_path.exists():
try:
# Make the HTTP request to Uniprot
response = requests.get(url, timeout=30)
response.raise_for_status()
# Write response to file if it contains data
if len(response.text.strip().split("\n")) > 1:
with open(file_path, "a") as file:
with open(file_path, "w") as file:
file.write(response.text)

# Read file to df for sorting
df = pd.read_csv(file_path, sep="\t", low_memory=False)
df = df.sort_values(
by=[PROTEOMES_ORGANISM_ID_COLUMNNAME, PROTEOMES_PROTEOME_ID_COLUMNNAME], axis=0, ascending=True
)
df.to_csv(file_path, sep="\t", index=False)
while "next" in response.links:
next_url = response.links["next"]["url"]
response = requests.get(next_url, timeout=30)
response.raise_for_status()
# Write response to file if it contains data
if len(response.text.strip().split("\n")) > 1:
with open(file_path, "a") as file:
file.write(response.text)
# Read file to df for sorting
df = pd.read_csv(file_path, sep="\t", low_memory=False)
df = df.sort_values(
by=[PROTEOMES_ORGANISM_ID_COLUMNNAME, PROTEOMES_PROTEOME_ID_COLUMNNAME], axis=0, ascending=True
)
df.to_csv(file_path, sep="\t", index=False)

organism_ids = df[PROTEOMES_ORGANISM_ID_COLUMNNAME].unique().tolist()

return organism_ids

except requests.exceptions.HTTPError:
print(f"Bad request for {PROTEOMES_FILENAME} - {response.status_code}")
except requests.exceptions.Timeout:
print("The request timed out")
except requests.exceptions.RequestException as e:
print(f"An error occurred: {e}")

organism_ids = df[PROTEOMES_ORGANISM_ID_COLUMNNAME].unique().tolist()
else:
# Read file to df for sorting
df = pd.read_csv(file_path, sep="\t", low_memory=False)
df = df.sort_values(
by=[PROTEOMES_ORGANISM_ID_COLUMNNAME, PROTEOMES_PROTEOME_ID_COLUMNNAME], axis=0, ascending=True
)
df.to_csv(file_path, sep="\t", index=False)

return organism_ids

except requests.exceptions.HTTPError:
print(f"Bad request for {PROTEOMES_FILENAME} - {response.status_code}")
except requests.exceptions.Timeout:
print("The request timed out")
except requests.exceptions.RequestException as e:
print(f"An error occurred: {e}")
organism_ids = df[PROTEOMES_ORGANISM_ID_COLUMNNAME].unique().tolist()

return organism_ids
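
The interleaved hunk above is easier to read as the resulting control flow: the proteome table is downloaded (following UniProt's paginated results via the HTTP Link header, exposed as response.links["next"]) only when the cached file is absent; otherwise the existing file is re-read. In both cases the table is sorted and the unique organism IDs are returned. A condensed sketch, with the try/except error handling omitted and the function and parameter names chosen for illustration:

import pandas as pd
import requests

def fetch_reference_proteomes(url, file_path, org_col, proteome_col):
    """Download (or reuse) the proteome table and return its unique organism IDs."""
    if not file_path.exists():
        response = requests.get(url, timeout=30)
        response.raise_for_status()
        if len(response.text.strip().split("\n")) > 1:
            with open(file_path, "w") as f:
                f.write(response.text)
        # Follow paginated results advertised in the Link response header.
        while "next" in response.links:
            response = requests.get(response.links["next"]["url"], timeout=30)
            response.raise_for_status()
            if len(response.text.strip().split("\n")) > 1:
                with open(file_path, "a") as f:
                    f.write(response.text)
    # Sort the cached table and collect the unique organism IDs.
    df = pd.read_csv(file_path, sep="\t", low_memory=False)
    df = df.sort_values(by=[org_col, proteome_col], ascending=True)
    df.to_csv(file_path, sep="\t", index=False)
    return df[org_col].unique().tolist()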

def run_uniprot_api(taxa_id_from_proteomes_set, show_status: bool) -> None:
"""
@@ -279,7 +298,6 @@ def run_uniprot_api_parallel(
# ! Cannot be used during multiprocessing
# Cache HTTP requests to avoid repeated calls
# requests_cache.install_cache("uniprot_cache")

organism_list = get_organism_list(input_dir=input_dir)

# Sort list
@@ -293,7 +311,7 @@
f.write(f"{line}\n")

#!For testing
# taxa_id_common_with_proteomes_list = taxa_id_common_with_proteomes_list[0:5]
#taxa_id_common_with_proteomes_list = taxa_id_common_with_proteomes_list[0:5]

# Set up a pool of worker processes
with multiprocessing.Pool(processes=workers) as pool:
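
The body of the pool block is truncated in this hunk. Purely as a hypothetical illustration (not the repository's actual code), a pool like this is typically used to fan fetch_uniprot_data out across the sorted taxon IDs:

import multiprocessing

from uniprot2s3.main import fetch_uniprot_data  # assumed import path

def run_fetch_in_parallel(taxa_ids, workers=4):
    """Hypothetical helper: fetch each taxon's UniProt data in worker processes."""
    with multiprocessing.Pool(processes=workers) as pool:
        # imap_unordered yields as workers finish, which pairs well with a
        # progress bar such as tqdm (or the module's DummyTqdm fallback).
        for _ in pool.imap_unordered(fetch_uniprot_data, taxa_ids):
            pass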
