Add effect/pvalue selectors to ExpressionAPI() #103

Merged (4 commits) on Aug 28, 2024
5 changes: 4 additions & 1 deletion .gitignore
@@ -1,8 +1,11 @@
#mac files
**/.DS_Store

# Dataset directory
data/

# logs
logs/
**/logs/

# local tmp files
tmp/*
2 changes: 1 addition & 1 deletion .pre-commit-config.yaml
@@ -1,4 +1,4 @@
exclude: "^docs/|devcontainer.json"
exclude: ^docs/|devcontainer.json|.*/snapshots/
default_stages: [commit]

default_language_version:
6 changes: 3 additions & 3 deletions docs/tutorials/database_interface.ipynb
@@ -2142,7 +2142,7 @@
],
"metadata": {
"kernelspec": {
"display_name": ".venv",
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
@@ -2156,9 +2156,9 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.11.9"
"version": "3.11.1"
}
},
"nbformat": 4,
"nbformat_minor": 2
"nbformat_minor": 4
}
4,260 changes: 4,260 additions & 0 deletions docs/tutorials/exploring_perturbation_response_relationship.ipynb

Large diffs are not rendered by default.

1 change: 0 additions & 1 deletion docs/tutorials/generate_in_silico_data.ipynb

Large diffs are not rendered by default.

850 changes: 691 additions & 159 deletions docs/tutorials/visualizing_and_testing_data_generation_methods.ipynb

Large diffs are not rendered by default.

1 change: 1 addition & 0 deletions pyproject.toml
@@ -23,6 +23,7 @@ jupyter = "^1.0.0"
requests = "^2.32.3"
cachetools = "^5.3.3"
python-dotenv = "^1.0.1"
statsmodels = "^0.14.1"

[tool.poetry.group.dev.dependencies]
jupyter = "^1.0.0"
6 changes: 4 additions & 2 deletions yeastdnnexplorer/interface/AbstractAPI.py
@@ -40,7 +40,7 @@ def __init__(
ParamsDict and Cache constructors.

"""
self.logger = logging.getLogger(__name__)
self.logger = logging.getLogger(self.__class__.__name__)
self._token = token or os.getenv("TOKEN", "")
self.url = url or os.getenv("BASE_URL", "")
self.params = ParamsDict(
@@ -159,9 +159,11 @@ def _is_valid_url(self, url: str) -> None:

"""
try:
# note that with allow_redirects=True an initial 3xx status code is not an
# error; the redirect is followed and the final response is what gets checked
response = requests.head(url, headers=self.header, allow_redirects=True)
if response.status_code != 200:
raise ValueError(f"Invalid URL or token provided: {response.content}")
raise ValueError("Invalid URL or token provided. Check both.")
except requests.RequestException as e:
raise AttributeError(f"Error validating URL: {e}") from e
except AttributeError as e:
51 changes: 31 additions & 20 deletions yeastdnnexplorer/interface/AbstractRecordsAndFilesAPI.py
@@ -1,6 +1,5 @@
import csv
import gzip
import logging
import os
import tarfile
import tempfile
@@ -109,16 +108,21 @@ async def read(
Retrieve data from the endpoint according to the `retrieve_files` parameter. If
`retrieve_files` is False, the records will be returned as a dataframe. If
`retrieve_files` is True, the files associated with the records will be
retrieved either from the local cache or from the database.
retrieved either from the local cache or from the database. Note that a user can
select which effect_colname and pvalue_colname are used for a genomicfile (see
the database documentation for more details). If one or both of those are present
in the params, and retrieve_files is True, then that column name is appended to
the cache_key. E.g. if record 1 is retrieved from the mcisaac data with
effect_colname "log2_ratio", then the cache_key for that data will be
"1_log2_ratio". The default effect_colname, which is set by the database, is
stored with only the record id as the cache_key.

:param callback: The function to call with the metadata. Signature must
include `metadata`, `data`, and `cache`.
:type callback: Callable[[pd.DataFrame, dict[str, Any] | None, Any], Any]
:param retrieve_files: Boolean. Whether to retrieve the files associated with
the records. Defaults to False.
:type retrieve_files: bool
:param kwargs: Additional arguments to pass to the callback function.
:type kwargs: Any

:return: The result of the callback function.
:rtype: Any
@@ -133,7 +137,7 @@ async def read(
)

export_url = f"{self.url.rstrip('/')}/{self.export_url_suffix}"
self.logger.debug("export_url: %s", export_url)
self.logger.debug("read() export_url: %s", export_url)

async with aiohttp.ClientSession() as session:
try:
@@ -157,10 +161,10 @@
)

except aiohttp.ClientError as e:
logging.error(f"Error in GET request: {e}")
self.logger.error(f"Error in GET request: {e}")
raise
except pd.errors.ParserError as e:
logging.error(f"Error reading request content: {e}")
self.logger.error(f"Error reading request content: {e}")
raise

async def _retrieve_files(
@@ -197,28 +201,34 @@ async def _retrieve_file(
:type record_id: int
:return: A DataFrame containing the file's data.
:rtype: pd.DataFrame
:raises FileNotFoundError: If the file is not found in the tar archive.
:raises ValueError: If the delimiter is not supported.

"""
export_files_url = f"{self.url.rstrip('/')}/{self.export_files_url_suffix}"
self.logger.debug("export_url: %s", export_files_url)
# Try to get the data from the cache first
self.logger.debug("_retrieve_file() export_url: %s", export_files_url)

# set key for local cache
cache_key = str(record_id)
if "effect_colname" in self.params:
cache_key += f"_{self.params['effect_colname']}"
if "pvalue_colname" in self.params:
cache_key += f"_{self.params['pvalue_colname']}"
cached_data = self._cache_get(cache_key)
if cached_data is not None:
logging.info(f"Record ID {record_id} retrieved from cache.")
self.logger.info(f"cache_key {cache_key} retrieved from cache.")
return pd.read_json(BytesIO(cached_data.encode()))
else:
self.logger.debug(f"cache_key {cache_key} not found in cache.")

# Retrieve from the database if not in cache
logging.info(
f"Record ID {record_id} not found in cache. Retrieving from the database."
)
try:
header = self.header.copy()
header["Content-Type"] = "application/gzip"
retrieve_files_params = self.params.copy()
retrieve_files_params.update({"id": record_id})
async with session.get(
export_files_url, headers=header, params={"id": record_id}, timeout=120
export_files_url,
headers=header,
params=retrieve_files_params,
timeout=120,
) as response:
response.raise_for_status()
tar_data = await response.read()
@@ -236,8 +246,8 @@
with tarfile.open(fileobj=tar_file, mode="r:gz") as tar:
tar_members = tar.getmembers()
self.logger.debug(
"Tar file contains: ",
"{[member.name for member in tar_members]}",
f"Tar file contains: "
f"{[member.name for member in tar_members]}",
)

# Find the specific file to extract
@@ -269,11 +279,12 @@
df = pd.read_csv(csv_path, delimiter=delimiter)

# Store the data in the cache
self.logger.debug(f"Storing {cache_key} in cache.")
self._cache_set(cache_key, df.to_json())
finally:
os.unlink(tar_file.name)

return df
except Exception as e:
logging.error(f"Error retrieving file for record ID {record_id}: {e}")
self.logger.error(f"Error retrieving file for cache_key {cache_key}: {e}")
raise
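
The cache-key scheme described in the `read()` docstring above is easiest to see in isolation. Below is a minimal sketch that mirrors the key-building logic added to `_retrieve_file()`; the helper name `build_cache_key` and the example values are illustrative only, not code taken from the repository.

```python
# Minimal sketch of the cache-key composition described above.
# `build_cache_key` and the example values are hypothetical; the real logic
# lives inline in AbstractRecordsAndFilesAPI._retrieve_file().
def build_cache_key(record_id: int, params: dict) -> str:
    key = str(record_id)
    if "effect_colname" in params:
        key += f"_{params['effect_colname']}"
    if "pvalue_colname" in params:
        key += f"_{params['pvalue_colname']}"
    return key


# Record 1 requested with a non-default effect column caches separately from
# the same record requested with the database default.
print(build_cache_key(1, {"effect_colname": "log2_ratio"}))  # "1_log2_ratio"
print(build_cache_key(1, {}))                                 # "1"
```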
2 changes: 1 addition & 1 deletion yeastdnnexplorer/interface/AbstractRecordsOnlyAPI.py
@@ -56,7 +56,7 @@ async def read(
)

export_url = f"{self.url.rstrip('/')}/{export_url_suffix}"
self.logger.debug("export_url: %s", export_url)
self.logger.debug("read() export_url: %s", export_url)

async with aiohttp.ClientSession() as session:
try:
2 changes: 2 additions & 0 deletions yeastdnnexplorer/interface/ExpressionAPI.py
@@ -33,6 +33,8 @@ def __init__(self, **kwargs) -> None:
"lab",
"assay",
"workflow",
"effect_colname",
"pvalue_colname",
],
)

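
Since the point of this PR is to expose the two new selectors, a short usage sketch may help. The import path, `push_params`, and `read(retrieve_files=...)` follow the diffs above; the concrete column values and the bare `ExpressionAPI()` construction (token and URL taken from environment variables) are assumptions, not code from the repository.

```python
# Hedged usage sketch: column values are illustrative; the client is assumed to
# pick up TOKEN and BASE_URL from the environment, as in AbstractAPI.__init__.
import asyncio

from yeastdnnexplorer.interface.ExpressionAPI import ExpressionAPI


async def main() -> None:
    api = ExpressionAPI()
    # The two parameters added by this PR: select which effect and p-value
    # columns the server returns for a genomic file.
    api.push_params({"effect_colname": "log2_ratio", "pvalue_colname": "pvalue"})
    # With retrieve_files=True, the selected column names are folded into the
    # local cache key (e.g. "1_log2_ratio"), so different selections do not
    # overwrite each other in the cache.
    result = await api.read(retrieve_files=True)
    print(result)


asyncio.run(main())
```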
5 changes: 2 additions & 3 deletions yeastdnnexplorer/interface/RankResponseAPI.py
@@ -1,6 +1,5 @@
import gzip
import json
import logging
import os
import tarfile
import tempfile
@@ -123,10 +122,10 @@ async def read(
return callback(metadata, data, self.cache, **additional_args)

except aiohttp.ClientError as e:
logging.error(f"Error in GET request: {e}")
self.logger.error(f"Error in GET request: {e}")
raise
except pd.errors.ParserError as e:
logging.error(f"Error reading request content: {e}")
self.logger.error(f"Error reading request content: {e}")
raise

def _extract_files(
@@ -0,0 +1 @@
None
@@ -0,0 +1 @@
test_value
@@ -0,0 +1 @@
test_key
12 changes: 6 additions & 6 deletions yeastdnnexplorer/tests/interface/test_AbstractAPI.py
@@ -40,16 +40,16 @@ def test_initialize(snapshot, api_client):
def test_push_params(snapshot, api_client):
params = {"param1": "value1", "param2": "value2"}
api_client.push_params(params)
snapshot.assert_match(api_client.params.as_dict(), "push_params")
assert api_client.params.as_dict() == params


def test_pop_params(snapshot, api_client):
params = {"param1": "value1", "param2": "value2"}
api_client.push_params(params)
api_client.pop_params(["param1"])
snapshot.assert_match(api_client.params.as_dict(), "pop_params_after_one_removed")
assert api_client.params.as_dict() == {"param2": "value2"}
api_client.pop_params()
snapshot.assert_match(api_client.params.as_dict(), "pop_params_after_all_removed")
assert api_client.params.as_dict() == {}


def test_is_valid_url(snapshot, api_client):
@@ -66,13 +66,13 @@ def test_cache_operations(snapshot, api_client):
value = "test_value"

api_client._cache_set(key, value)
snapshot.assert_match(api_client._cache_get(key), "cache_get_after_set")
snapshot.assert_match(str(api_client._cache_get(key)), "cache_get_after_set")

keys = api_client._cache_list()
snapshot.assert_match(keys, "cache_list")
snapshot.assert_match(", ".join(keys), "cache_list")

api_client._cache_delete(key)
snapshot.assert_match(api_client._cache_get(key), "cache_get_after_delete")
snapshot.assert_match(str(api_client._cache_get(key)), "cache_get_after_delete")


if __name__ == "__main__":
@@ -1,3 +1,4 @@
import gzip
from typing import Any

import pandas as pd
@@ -42,11 +43,14 @@ async def test_read(snapshot, api_client):
"10939,1,2024-03-26,1,2024-03-26 14:29:47.853980+00:00,4327,4,6,5,promotersetsig/10939.csv.gz" # noqa: E501
)

# Convert to bytes and gzip the content
gzipped_csv = gzip.compress(mocked_csv.encode("utf-8"))

m.get(
"https://example.com/api/endpoint/export",
status=200,
body=mocked_csv,
headers={"Content-Type": "text/csv"},
body=gzipped_csv,
headers={"Content-Type": "application/gzip"},
)

result = await api_client.read()