
Commit

added main.py
bsantan committed Jun 6, 2024
1 parent 3e3aedb commit 8e51b2e
Showing 1 changed file with 60 additions and 42 deletions.
102 changes: 60 additions & 42 deletions src/uniprot2s3/main.py
@@ -33,7 +33,9 @@
)
from .dummy_tqdm import DummyTqdm

ORGANISM_RESOURCE = "ncbitaxon_removed_subset.json"
# ORGANISM_RESOURCE = "ncbitaxon_removed_subset.json"
DISBIOME_ORGANISM_RESOURCE = "Disbiome_Microbe_Labels.csv"
PD_ORGANISM_RESOURCE = "PD_Microbe_Labels.csv"
EMPTY_ORGANISM_OUTFILE = RAW_DATA_DIR / "uniprot_empty_organism.tsv"


@@ -61,17 +63,21 @@ def get_organism_list(input_dir: Union[Path, str] = RAW_DATA_DIR) -> List[str]:
:return: List of organism IDs.
"""
# Read organism resource file and extract organism IDs
with open(Path(input_dir) / ORGANISM_RESOURCE, "r") as f:
contents = json.load(f)
ncbi_prefix = NCBITAXON_PREFIX.replace(":", "_")

# Create a list of organism IDs after filtering and cleaning
organism_list = [
i["id"].split(ncbi_prefix)[1]
for i in contents["graphs"][0]["nodes"]
if ncbi_prefix in i["id"] and i["id"].split(ncbi_prefix)[1].isdigit()
]
# Read disbiome organism resource file and extract organism IDs
with open(Path(input_dir) / DISBIOME_ORGANISM_RESOURCE, "r") as f:
organism_list = []
reader = csv.DictReader(f, delimiter='\t')
# Create a list of organism IDs after filtering and cleaning
for row in reader:
organism_list.append(row["entity_uri"].split(NCBITAXON_PREFIX)[1])

# Read PD organism resource file and extract organism IDs
with open(Path(input_dir) / PD_ORGANISM_RESOURCE, "r") as f:
reader = csv.DictReader(f, delimiter='\t')
# Create a list of organism IDs after filtering and cleaning
for row in reader:
organism_list.append(row["entity_uri"].split(NCBITAXON_PREFIX)[1])

# Update organism list based on existing empty request files
for file_path in [EMPTY_ORGANISM_OUTFILE]:
if file_path.is_file():
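
A minimal consolidated sketch of the organism-list change in this hunk, assuming both label files are tab-delimited with an "entity_uri" column whose values embed the numeric taxon ID after NCBITAXON_PREFIX (e.g. "NCBITaxon:"); the helper name read_taxon_ids is illustrative, not part of the module:

import csv
from pathlib import Path

def read_taxon_ids(path: Path, prefix: str = "NCBITaxon:") -> list:
    """Extract numeric NCBITaxon IDs from the entity_uri column of a label file."""
    ids = []
    with open(path, "r") as f:
        for row in csv.DictReader(f, delimiter="\t"):
            uri = row["entity_uri"]
            if prefix in uri:  # skip rows without a taxon URI
                ids.append(uri.split(prefix)[1])
    return ids

# Usage sketch: combine the Disbiome and PD label files into one organism list.
# organism_list = (read_taxon_ids(input_dir / "Disbiome_Microbe_Labels.csv")
#                  + read_taxon_ids(input_dir / "PD_Microbe_Labels.csv"))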
@@ -151,6 +157,8 @@ def fetch_uniprot_data(organism_id):
:param organism_id: The ID of the NCBITaxon entity.
"""
global UNIPROT_S3_DIR
UNIPROT_S3_DIR = Path(RAW_DATA_DIR).joinpath("s3")
file_path = UNIPROT_S3_DIR / f"{organism_id}.{UNIPROT_DESIRED_FORMAT}"
organism_query = TAXONOMY_ID_UNIPROT_PREFIX + organism_id
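
For illustration only: assuming RAW_DATA_DIR resolves to data/raw and UNIPROT_DESIRED_FORMAT is "tsv" (neither value is confirmed by this diff), the two added lines produce a per-organism output path like this:

from pathlib import Path

RAW_DATA_DIR = Path("data/raw")   # assumed value
UNIPROT_DESIRED_FORMAT = "tsv"    # assumed value
organism_id = "562"               # example NCBITaxon ID (E. coli)

UNIPROT_S3_DIR = Path(RAW_DATA_DIR).joinpath("s3")
file_path = UNIPROT_S3_DIR / f"{organism_id}.{UNIPROT_DESIRED_FORMAT}"
print(file_path)                  # data/raw/s3/562.tsv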

@@ -191,42 +199,53 @@ def fetch_uniprot_reference_proteome_data() -> list:
UNIPROT_SIZE,
)

try:
# Make the HTTP request to Uniprot
response = requests.get(url, timeout=30)
response.raise_for_status()
# Write response to file if it contains data
if len(response.text.strip().split("\n")) > 1:
with open(file_path, "w") as file:
file.write(response.text)

while "next" in response.links:
next_url = response.links["next"]["url"]
response = requests.get(next_url, timeout=30)
if not file_path.exists():
try:
# Make the HTTP request to Uniprot
response = requests.get(url, timeout=30)
response.raise_for_status()
# Write response to file if it contains data
if len(response.text.strip().split("\n")) > 1:
with open(file_path, "a") as file:
with open(file_path, "w") as file:
file.write(response.text)

# Read file to df for sorting
df = pd.read_csv(file_path, sep="\t", low_memory=False)
df = df.sort_values(
by=[PROTEOMES_ORGANISM_ID_COLUMNNAME, PROTEOMES_PROTEOME_ID_COLUMNNAME], axis=0, ascending=True
)
df.to_csv(file_path, sep="\t", index=False)
while "next" in response.links:
next_url = response.links["next"]["url"]
response = requests.get(next_url, timeout=30)
response.raise_for_status()
# Write response to file if it contains data
if len(response.text.strip().split("\n")) > 1:
with open(file_path, "a") as file:
file.write(response.text)
# Read file to df for sorting
df = pd.read_csv(file_path, sep="\t", low_memory=False)
df = df.sort_values(
by=[PROTEOMES_ORGANISM_ID_COLUMNNAME, PROTEOMES_PROTEOME_ID_COLUMNNAME], axis=0, ascending=True
)
df.to_csv(file_path, sep="\t", index=False)

organism_ids = df[PROTEOMES_ORGANISM_ID_COLUMNNAME].unique().tolist()

return organism_ids

except requests.exceptions.HTTPError:
print(f"Bad request for {PROTEOMES_FILENAME} - {response.status_code}")
except requests.exceptions.Timeout:
print("The request timed out")
except requests.exceptions.RequestException as e:
print(f"An error occurred: {e}")

organism_ids = df[PROTEOMES_ORGANISM_ID_COLUMNNAME].unique().tolist()
else:
# Read file to df for sorting
df = pd.read_csv(file_path, sep="\t", low_memory=False)
df = df.sort_values(
by=[PROTEOMES_ORGANISM_ID_COLUMNNAME, PROTEOMES_PROTEOME_ID_COLUMNNAME], axis=0, ascending=True
)
df.to_csv(file_path, sep="\t", index=False)

return organism_ids

except requests.exceptions.HTTPError:
print(f"Bad request for {PROTEOMES_FILENAME} - {response.status_code}")
except requests.exceptions.Timeout:
print("The request timed out")
except requests.exceptions.RequestException as e:
print(f"An error occurred: {e}")
organism_ids = df[PROTEOMES_ORGANISM_ID_COLUMNNAME].unique().tolist()

return organism_ids
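
The interleaved hunk above is easier to read as the resulting control flow: the proteome table is downloaded (following UniProt's paginated results via the HTTP Link header, exposed as response.links["next"]) only when the cached file is absent; otherwise the existing file is re-read. In both cases the table is sorted and the unique organism IDs are returned. A condensed sketch, with the try/except error handling omitted and the function and parameter names chosen for illustration:

import pandas as pd
import requests

def fetch_reference_proteomes(url, file_path, org_col, proteome_col):
    """Download (or reuse) the proteome table and return its unique organism IDs."""
    if not file_path.exists():
        response = requests.get(url, timeout=30)
        response.raise_for_status()
        if len(response.text.strip().split("\n")) > 1:
            with open(file_path, "w") as f:
                f.write(response.text)
        # Follow paginated results advertised in the Link response header.
        while "next" in response.links:
            response = requests.get(response.links["next"]["url"], timeout=30)
            response.raise_for_status()
            if len(response.text.strip().split("\n")) > 1:
                with open(file_path, "a") as f:
                    f.write(response.text)
    # Sort the cached table and collect the unique organism IDs.
    df = pd.read_csv(file_path, sep="\t", low_memory=False)
    df = df.sort_values(by=[org_col, proteome_col], ascending=True)
    df.to_csv(file_path, sep="\t", index=False)
    return df[org_col].unique().tolist()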

def run_uniprot_api(taxa_id_from_proteomes_set, show_status: bool) -> None:
"""
@@ -279,7 +298,6 @@ def run_uniprot_api_parallel(
# ! Cannot be used during multiprocessing
# Cache HTTP requests to avoid repeated calls
# requests_cache.install_cache("uniprot_cache")

organism_list = get_organism_list(input_dir=input_dir)

# Sort list
@@ -293,7 +311,7 @@
f.write(f"{line}\n")

#!For testing
# taxa_id_common_with_proteomes_list = taxa_id_common_with_proteomes_list[0:5]
#taxa_id_common_with_proteomes_list = taxa_id_common_with_proteomes_list[0:5]

# Set up a pool of worker processes
with multiprocessing.Pool(processes=workers) as pool:
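
The body of the pool block is truncated in this hunk. Purely as a hypothetical illustration (not the repository's actual code), a pool like this is typically used to fan fetch_uniprot_data out across the sorted taxon IDs:

import multiprocessing

from uniprot2s3.main import fetch_uniprot_data  # assumed import path

def run_fetch_in_parallel(taxa_ids, workers=4):
    """Hypothetical helper: fetch each taxon's UniProt data in worker processes."""
    with multiprocessing.Pool(processes=workers) as pool:
        # imap_unordered yields as workers finish, which pairs well with a
        # progress bar such as tqdm (or the module's DummyTqdm fallback).
        for _ in pool.imap_unordered(fetch_uniprot_data, taxa_ids):
            pass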
