Add effect/pvalue selectors to ExpressionAPI() #103

Merged (4 commits) on Aug 28, 2024
5 changes: 4 additions & 1 deletion .gitignore
@@ -1,8 +1,11 @@
#mac files
**/.DS_Store

# Dataset directory
data/

# logs
logs/
**/logs/

# local tmp files
tmp/*
2 changes: 1 addition & 1 deletion .pre-commit-config.yaml
@@ -1,4 +1,4 @@
exclude: "^docs/|devcontainer.json"
exclude: ^docs/|devcontainer.json|.*/snapshots/
default_stages: [commit]

default_language_version:
6 changes: 3 additions & 3 deletions docs/tutorials/database_interface.ipynb
@@ -2142,7 +2142,7 @@
],
"metadata": {
"kernelspec": {
"display_name": ".venv",
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
@@ -2156,9 +2156,9 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.11.9"
"version": "3.11.1"
}
},
"nbformat": 4,
"nbformat_minor": 2
"nbformat_minor": 4
}
4,260 changes: 4,260 additions & 0 deletions docs/tutorials/exploring_perturbation_response_relationship.ipynb

Large diffs are not rendered by default.

1 change: 0 additions & 1 deletion docs/tutorials/generate_in_silico_data.ipynb

Large diffs are not rendered by default.

850 changes: 691 additions & 159 deletions docs/tutorials/visualizing_and_testing_data_generation_methods.ipynb

Large diffs are not rendered by default.

1 change: 1 addition & 0 deletions pyproject.toml
@@ -23,6 +23,7 @@ jupyter = "^1.0.0"
requests = "^2.32.3"
cachetools = "^5.3.3"
python-dotenv = "^1.0.1"
statsmodels = "^0.14.1"

[tool.poetry.group.dev.dependencies]
jupyter = "^1.0.0"
6 changes: 4 additions & 2 deletions yeastdnnexplorer/interface/AbstractAPI.py
@@ -40,7 +40,7 @@ def __init__(
ParamsDict and Cache constructors.

"""
self.logger = logging.getLogger(__name__)
self.logger = logging.getLogger(self.__class__.__name__)
self._token = token or os.getenv("TOKEN", "")
self.url = url or os.getenv("BASE_URL", "")
self.params = ParamsDict(
@@ -159,9 +159,11 @@ def _is_valid_url(self, url: str) -> None:

"""
try:
# note that with allow_redirects=True an initial 3xx status code is not an
# error; the redirect is followed and the final response is what gets checked
response = requests.head(url, headers=self.header, allow_redirects=True)
if response.status_code != 200:
raise ValueError(f"Invalid URL or token provided: {response.content}")
raise ValueError("Invalid URL or token provided. Check both.")
except requests.RequestException as e:
raise AttributeError(f"Error validating URL: {e}") from e
except AttributeError as e:
51 changes: 31 additions & 20 deletions yeastdnnexplorer/interface/AbstractRecordsAndFilesAPI.py
@@ -1,6 +1,5 @@
import csv
import gzip
import logging
import os
import tarfile
import tempfile
@@ -109,16 +108,21 @@ async def read(
Retrieve data from the endpoint according to the `retrieve_files` parameter. If
`retrieve_files` is False, the records will be returned as a dataframe. If
`retrieve_files` is True, the files associated with the records will be
retrieved either from the local cache or from the database.
retrieved either from the local cache or from the database. Note that a user can
select which effect_colname and pvalue_colname are used for a genomicfile (see
the database documentation for more details). If one or both of those are present
in the params, and retrieve_files is True, then that column name is appended to
the cache_key. E.g. if record 1 is retrieved from the mcisaac data with
effect_colname "log2_ratio", then the cache_key for that data will be
"1_log2_ratio". The default effect_colname, which is set by the database, is
stored with only the record id as the cache_key.

:param callback: The function to call with the metadata. Signature must
include `metadata`, `data`, and `cache`.
:type callback: Callable[[pd.DataFrame, dict[str, Any] | None, Any], Any]
:param retrieve_files: Boolean. Whether to retrieve the files associated with
the records. Defaults to False.
:type retrieve_files: bool
:param kwargs: Additional arguments to pass to the callback function.
:type kwargs: Any

:return: The result of the callback function.
:rtype: Any
@@ -133,7 +137,7 @@ async def read(
)

export_url = f"{self.url.rstrip('/')}/{self.export_url_suffix}"
self.logger.debug("export_url: %s", export_url)
self.logger.debug("read() export_url: %s", export_url)

async with aiohttp.ClientSession() as session:
try:
@@ -157,10 +161,10 @@
)

except aiohttp.ClientError as e:
logging.error(f"Error in GET request: {e}")
self.logger.error(f"Error in GET request: {e}")
raise
except pd.errors.ParserError as e:
logging.error(f"Error reading request content: {e}")
self.logger.error(f"Error reading request content: {e}")
raise

async def _retrieve_files(
@@ -197,28 +201,34 @@ async def _retrieve_file(
:type record_id: int
:return: A DataFrame containing the file's data.
:rtype: pd.DataFrame
:raises FileNotFoundError: If the file is not found in the tar archive.
:raises ValueError: If the delimiter is not supported.

"""
export_files_url = f"{self.url.rstrip('/')}/{self.export_files_url_suffix}"
self.logger.debug("export_url: %s", export_files_url)
# Try to get the data from the cache first
self.logger.debug("_retrieve_file() export_url: %s", export_files_url)

# set key for local cache
cache_key = str(record_id)
if "effect_colname" in self.params:
cache_key += f"_{self.params['effect_colname']}"
if "pvalue_colname" in self.params:
cache_key += f"_{self.params['pvalue_colname']}"
cached_data = self._cache_get(cache_key)
if cached_data is not None:
logging.info(f"Record ID {record_id} retrieved from cache.")
self.logger.info(f"cache_key {cache_key} retrieved from cache.")
return pd.read_json(BytesIO(cached_data.encode()))
else:
self.logger.debug(f"cache_key {cache_key} not found in cache.")

# Retrieve from the database if not in cache
logging.info(
f"Record ID {record_id} not found in cache. Retrieving from the database."
)
try:
header = self.header.copy()
header["Content-Type"] = "application/gzip"
retrieve_files_params = self.params.copy()
retrieve_files_params.update({"id": record_id})
async with session.get(
export_files_url, headers=header, params={"id": record_id}, timeout=120
export_files_url,
headers=header,
params=retrieve_files_params,
timeout=120,
) as response:
response.raise_for_status()
tar_data = await response.read()
@@ -236,8 +246,8 @@
with tarfile.open(fileobj=tar_file, mode="r:gz") as tar:
tar_members = tar.getmembers()
self.logger.debug(
"Tar file contains: ",
"{[member.name for member in tar_members]}",
f"Tar file contains: "
f"{[member.name for member in tar_members]}",
)

# Find the specific file to extract
@@ -269,11 +279,12 @@
df = pd.read_csv(csv_path, delimiter=delimiter)

# Store the data in the cache
self.logger.debug(f"Storing {cache_key} in cache.")
self._cache_set(cache_key, df.to_json())
finally:
os.unlink(tar_file.name)

return df
except Exception as e:
logging.error(f"Error retrieving file for record ID {record_id}: {e}")
self.logger.error(f"Error retrieving file for cache_key {cache_key}: {e}")
raise
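
The cache-key scheme described in the `read()` docstring above is easiest to see in isolation. Below is a minimal sketch that mirrors the key-building logic added to `_retrieve_file()`; the helper name `build_cache_key` and the example values are illustrative only, not code taken from the repository.

```python
# Minimal sketch of the cache-key composition described above.
# `build_cache_key` and the example values are hypothetical; the real logic
# lives inline in AbstractRecordsAndFilesAPI._retrieve_file().
def build_cache_key(record_id: int, params: dict) -> str:
    key = str(record_id)
    if "effect_colname" in params:
        key += f"_{params['effect_colname']}"
    if "pvalue_colname" in params:
        key += f"_{params['pvalue_colname']}"
    return key


# Record 1 requested with a non-default effect column caches separately from
# the same record requested with the database default.
print(build_cache_key(1, {"effect_colname": "log2_ratio"}))  # "1_log2_ratio"
print(build_cache_key(1, {}))                                 # "1"
```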
2 changes: 1 addition & 1 deletion yeastdnnexplorer/interface/AbstractRecordsOnlyAPI.py
@@ -56,7 +56,7 @@ async def read(
)

export_url = f"{self.url.rstrip('/')}/{export_url_suffix}"
self.logger.debug("export_url: %s", export_url)
self.logger.debug("read() export_url: %s", export_url)

async with aiohttp.ClientSession() as session:
try:
2 changes: 2 additions & 0 deletions yeastdnnexplorer/interface/ExpressionAPI.py
@@ -33,6 +33,8 @@ def __init__(self, **kwargs) -> None:
"lab",
"assay",
"workflow",
"effect_colname",
"pvalue_colname",
],
)

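
Since the point of this PR is to expose the two new selectors, a short usage sketch may help. The import path, `push_params`, and `read(retrieve_files=...)` follow the diffs above; the concrete column values and the bare `ExpressionAPI()` construction (token and URL taken from environment variables) are assumptions, not code from the repository.

```python
# Hedged usage sketch: column values are illustrative; the client is assumed to
# pick up TOKEN and BASE_URL from the environment, as in AbstractAPI.__init__.
import asyncio

from yeastdnnexplorer.interface.ExpressionAPI import ExpressionAPI


async def main() -> None:
    api = ExpressionAPI()
    # The two parameters added by this PR: select which effect and p-value
    # columns the server returns for a genomic file.
    api.push_params({"effect_colname": "log2_ratio", "pvalue_colname": "pvalue"})
    # With retrieve_files=True, the selected column names are folded into the
    # local cache key (e.g. "1_log2_ratio"), so different selections do not
    # overwrite each other in the cache.
    result = await api.read(retrieve_files=True)
    print(result)


asyncio.run(main())
```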
5 changes: 2 additions & 3 deletions yeastdnnexplorer/interface/RankResponseAPI.py
@@ -1,6 +1,5 @@
import gzip
import json
import logging
import os
import tarfile
import tempfile
@@ -123,10 +122,10 @@ async def read(
return callback(metadata, data, self.cache, **additional_args)

except aiohttp.ClientError as e:
logging.error(f"Error in GET request: {e}")
self.logger.error(f"Error in GET request: {e}")
raise
except pd.errors.ParserError as e:
logging.error(f"Error reading request content: {e}")
self.logger.error(f"Error reading request content: {e}")
raise

def _extract_files(
@@ -0,0 +1 @@
None
@@ -0,0 +1 @@
test_value
@@ -0,0 +1 @@
test_key
12 changes: 6 additions & 6 deletions yeastdnnexplorer/tests/interface/test_AbstractAPI.py
@@ -40,16 +40,16 @@ def test_initialize(snapshot, api_client):
def test_push_params(snapshot, api_client):
params = {"param1": "value1", "param2": "value2"}
api_client.push_params(params)
snapshot.assert_match(api_client.params.as_dict(), "push_params")
assert api_client.params.as_dict() == params


def test_pop_params(snapshot, api_client):
params = {"param1": "value1", "param2": "value2"}
api_client.push_params(params)
api_client.pop_params(["param1"])
snapshot.assert_match(api_client.params.as_dict(), "pop_params_after_one_removed")
assert api_client.params.as_dict() == {"param2": "value2"}
api_client.pop_params()
snapshot.assert_match(api_client.params.as_dict(), "pop_params_after_all_removed")
assert api_client.params.as_dict() == {}


def test_is_valid_url(snapshot, api_client):
@@ -66,13 +66,13 @@ def test_cache_operations(snapshot, api_client):
value = "test_value"

api_client._cache_set(key, value)
snapshot.assert_match(api_client._cache_get(key), "cache_get_after_set")
snapshot.assert_match(str(api_client._cache_get(key)), "cache_get_after_set")

keys = api_client._cache_list()
snapshot.assert_match(keys, "cache_list")
snapshot.assert_match(", ".join(keys), "cache_list")

api_client._cache_delete(key)
snapshot.assert_match(api_client._cache_get(key), "cache_get_after_delete")
snapshot.assert_match(str(api_client._cache_get(key)), "cache_get_after_delete")


if __name__ == "__main__":
@@ -1,3 +1,4 @@
import gzip
from typing import Any

import pandas as pd
@@ -42,11 +43,14 @@ async def test_read(snapshot, api_client):
"10939,1,2024-03-26,1,2024-03-26 14:29:47.853980+00:00,4327,4,6,5,promotersetsig/10939.csv.gz" # noqa: E501
)

# Convert to bytes and gzip the content
gzipped_csv = gzip.compress(mocked_csv.encode("utf-8"))

m.get(
"https://example.com/api/endpoint/export",
status=200,
body=mocked_csv,
headers={"Content-Type": "text/csv"},
body=gzipped_csv,
headers={"Content-Type": "application/gzip"},
)

result = await api_client.read()