|
36 | 36 | ORGANISM_RESOURCE = "ncbitaxon_removed_subset.json"
|
37 | 37 | EMPTY_ORGANISM_OUTFILE = RAW_DATA_DIR / "uniprot_empty_organism.tsv"
|
38 | 38 |
|
# Define UNIPROT_S3_DIR globally: the "s3" subdirectory of RAW_DATA_DIR when
# RAW_DATA_DIR exists as a directory, otherwise a relative "s3" directory.
if RAW_DATA_DIR.is_dir():
    UNIPROT_S3_DIR = Path(RAW_DATA_DIR).joinpath("s3")
else:
    # Must be a Path, not a bare str: the mkdir() call below is a Path
    # method and would raise AttributeError on a plain string.
    UNIPROT_S3_DIR = Path("s3")
# Create the directory (and any missing parents); no error if it already exists.
UNIPROT_S3_DIR.mkdir(parents=True, exist_ok=True)
| 45 | + |
| 46 | + |
39 | 47 |
|
40 | 48 | # Function to read organisms from a CSV file and return a set
|
41 | 49 | def _read_organisms_from_csv(file_path):
|
@@ -92,7 +100,6 @@ def run_api(show_status: bool, input_dir=RAW_DATA_DIR) -> None:
|
92 | 100 | :param api: A string pointing to the API to upload data to.
|
93 | 101 | :return: None
|
94 | 102 | """
|
95 |
| - global UNIPROT_S3_DIR |
96 | 103 | proteome_organism_list = run_proteome_api(show_status)
|
97 | 104 | UNIPROT_S3_DIR = Path(input_dir).joinpath("s3")
|
98 | 105 | UNIPROT_S3_DIR.mkdir(parents=True, exist_ok=True)
|
@@ -181,12 +188,17 @@ def fetch_uniprot_data(organism_id):
|
181 | 188 | def fetch_uniprot_reference_proteome_data() -> list:
|
182 | 189 | """Single URL request for Uniprot proteome data."""
|
183 | 190 | file_path = Path(RAW_DATA_DIR) / f"{PROTEOMES_FILENAME}.{UNIPROT_DESIRED_FORMAT}"
|
184 |
| - all_proteomes_query = "%28*%29" |
| 191 | + # all_proteomes_query = "%28*%29" |
| 192 | + filtered_proteomes_query = ( |
| 193 | + "(*)+AND+((superkingdom:Bacteria)+OR+(superkingdom:Archaea))" |
| 194 | + "+AND+((proteome_type:1)+OR+(proteome_type:2))" |
| 195 | + ) |
| 196 | + |
185 | 197 |
|
186 | 198 | url = construct_query_url(
|
187 | 199 | UNIPROT_REFERENCE_PROTEOMES_URL,
|
188 | 200 | UNIPROT_DESIRED_FORMAT,
|
189 |
| - all_proteomes_query, |
| 201 | + filtered_proteomes_query, |
190 | 202 | UNIPROT_REFERENCE_PROTEOMES_FIELDS,
|
191 | 203 | UNIPROT_SIZE,
|
192 | 204 | )
|
@@ -301,7 +313,7 @@ def run_uniprot_api_parallel(
|
301 | 313 | fetch_func = partial(fetch_uniprot_data)
|
302 | 314 | # If show_status is True, use process_map to display a progress bar
|
303 | 315 | if show_status:
|
304 |
| - process_map(fetch_func, taxa_id_common_with_proteomes_list, max_workers=workers) |
| 316 | + process_map(fetch_func, taxa_id_common_with_proteomes_list, max_workers=workers, chunksize=999) |
305 | 317 | else:
|
306 | 318 | # Set up a pool of worker processes without a progress bar
|
307 | 319 | with multiprocessing.Pool(processes=workers) as pool:
|
|
0 commit comments