Skip to content

Commit 5744f93

Browse files
committed
Updated query with additional filtering terms
1 parent 9b0428d commit 5744f93

File tree

1 file changed

+16
-4
lines changed

1 file changed

+16
-4
lines changed

src/uniprot2s3/main.py

+16-4
Original file line numberDiff line numberDiff line change
@@ -36,6 +36,14 @@
3636
ORGANISM_RESOURCE = "ncbitaxon_removed_subset.json"
3737
EMPTY_ORGANISM_OUTFILE = RAW_DATA_DIR / "uniprot_empty_organism.tsv"
3838

39+
# Define UNIPROT_S3_DIR globally
40+
if RAW_DATA_DIR.is_dir():
41+
UNIPROT_S3_DIR = Path(RAW_DATA_DIR).joinpath("s3")
42+
else:
43+
# Fall back to a relative "s3" directory; keep it a Path (not a bare str)
# so Path methods such as .mkdir() are available on it.
UNIPROT_S3_DIR = Path("s3")
44+
UNIPROT_S3_DIR.mkdir(parents=True, exist_ok=True)
45+
46+
3947

4048
# Function to read organisms from a CSV file and return a set
4149
def _read_organisms_from_csv(file_path):
@@ -92,7 +100,6 @@ def run_api(show_status: bool, input_dir=RAW_DATA_DIR) -> None:
92100
:param api: A string pointing to the API to upload data to.
93101
:return: None
94102
"""
95-
global UNIPROT_S3_DIR
96103
proteome_organism_list = run_proteome_api(show_status)
97104
UNIPROT_S3_DIR = Path(input_dir).joinpath("s3")
98105
UNIPROT_S3_DIR.mkdir(parents=True, exist_ok=True)
@@ -181,12 +188,17 @@ def fetch_uniprot_data(organism_id):
181188
def fetch_uniprot_reference_proteome_data() -> list:
182189
"""Single URL request for Uniprot proteome data."""
183190
file_path = Path(RAW_DATA_DIR) / f"{PROTEOMES_FILENAME}.{UNIPROT_DESIRED_FORMAT}"
184-
all_proteomes_query = "%28*%29"
191+
# all_proteomes_query = "%28*%29"
192+
filtered_proteomes_query = (
193+
"(*)+AND+((superkingdom:Bacteria)+OR+(superkingdom:Archaea))"
194+
"+AND+((proteome_type:1)+OR+(proteome_type:2))"
195+
)
196+
185197

186198
url = construct_query_url(
187199
UNIPROT_REFERENCE_PROTEOMES_URL,
188200
UNIPROT_DESIRED_FORMAT,
189-
all_proteomes_query,
201+
filtered_proteomes_query,
190202
UNIPROT_REFERENCE_PROTEOMES_FIELDS,
191203
UNIPROT_SIZE,
192204
)
@@ -301,7 +313,7 @@ def run_uniprot_api_parallel(
301313
fetch_func = partial(fetch_uniprot_data)
302314
# If show_status is True, use process_map to display a progress bar
303315
if show_status:
304-
process_map(fetch_func, taxa_id_common_with_proteomes_list, max_workers=workers)
316+
process_map(fetch_func, taxa_id_common_with_proteomes_list, max_workers=workers, chunksize=999)
305317
else:
306318
# Set up a pool of worker processes without a progress bar
307319
with multiprocessing.Pool(processes=workers) as pool:

0 commit comments

Comments
 (0)