From d6a9a3595e8f1e799935717f541c1675381ee795 Mon Sep 17 00:00:00 2001 From: bsantan <70932395+bsantan@users.noreply.github.com> Date: Thu, 6 Jun 2024 12:21:45 -0600 Subject: [PATCH] check if organisms already exist in s3 dir --- src/uniprot2s3/main.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/src/uniprot2s3/main.py b/src/uniprot2s3/main.py index cb5a4a0..77d0d20 100644 --- a/src/uniprot2s3/main.py +++ b/src/uniprot2s3/main.py @@ -299,6 +299,11 @@ def run_uniprot_api_parallel( # Cache HTTP requests to avoid repeated calls # requests_cache.install_cache("uniprot_cache") organism_list = get_organism_list(input_dir=input_dir) + # See which organisms have already been downloaded + existing_organism_ids = os.listdir(UNIPROT_S3_DIR) + existing_organism_ids = [file for file in existing_organism_ids if file.endswith('.tsv')] + existing_organism_ids = [file.replace('.tsv','') for file in existing_organism_ids] + organism_list = list(set(organism_list).difference(set(existing_organism_ids))) # Sort list taxa_id_common_with_proteomes_list = list(set(organism_list).intersection(taxa_id_from_proteomes_list))