(PXP-10739): Add bucket region to DRS #355

Draft · wants to merge 19 commits into master
Changes from 14 commits
28 changes: 14 additions & 14 deletions .secrets.baseline
@@ -275,7 +275,7 @@
         "filename": "tests/test_aliases_endpoints.py",
         "hashed_secret": "5666c088b494f26cd8f63ace013992f5fc391ce0",
         "is_verified": false,
-        "line_number": 41
+        "line_number": 43
       }
     ],
     "tests/test_bundles.py": [
@@ -284,7 +284,7 @@
         "filename": "tests/test_bundles.py",
         "hashed_secret": "5666c088b494f26cd8f63ace013992f5fc391ce0",
         "is_verified": false,
-        "line_number": 30
+        "line_number": 25
       },
       {
         "type": "Hex High Entropy String",
@@ -328,70 +328,70 @@
         "filename": "tests/test_client.py",
         "hashed_secret": "15a6d8daad1278efcaadc0d6e3d1dd2d9ebbc262",
         "is_verified": false,
-        "line_number": 1084
+        "line_number": 1092
       },
       {
         "type": "Hex High Entropy String",
         "filename": "tests/test_client.py",
         "hashed_secret": "1b0d1a618b5c213dd792bbc3aa96ffa6bc370ef3",
         "is_verified": false,
-        "line_number": 1300
+        "line_number": 1308
       },
       {
         "type": "Hex High Entropy String",
         "filename": "tests/test_client.py",
         "hashed_secret": "1170ace44158ff189902ff44597efef121623353",
         "is_verified": false,
-        "line_number": 1731
+        "line_number": 1741
       },
       {
         "type": "Hex High Entropy String",
         "filename": "tests/test_client.py",
         "hashed_secret": "ff9c79b737b3ea7386618cc9437d3fb0a772182b",
         "is_verified": false,
-        "line_number": 2406
+        "line_number": 2416
       },
       {
         "type": "Hex High Entropy String",
         "filename": "tests/test_client.py",
         "hashed_secret": "c8176f1e75e62e15dabaa4087fb7194451c8f6d2",
         "is_verified": false,
-        "line_number": 2409
+        "line_number": 2419
       },
       {
         "type": "Hex High Entropy String",
         "filename": "tests/test_client.py",
         "hashed_secret": "d5198f8eddb1cbeb437899cd99e5ee97ab8531b4",
         "is_verified": false,
-        "line_number": 2409
+        "line_number": 2419
       },
       {
         "type": "Hex High Entropy String",
         "filename": "tests/test_client.py",
         "hashed_secret": "02dc196562514eaa3e2feac1f441ccf6ad81e09d",
         "is_verified": false,
-        "line_number": 2413
+        "line_number": 2423
      },
       {
         "type": "Hex High Entropy String",
         "filename": "tests/test_client.py",
         "hashed_secret": "f1cb2d91a95165a2ab909eadd9f7b65f312c7e2d",
         "is_verified": false,
-        "line_number": 2414
+        "line_number": 2424
       },
       {
         "type": "Hex High Entropy String",
         "filename": "tests/test_client.py",
         "hashed_secret": "58db546de03270b55a4c889a5c5e6296b29fef25",
         "is_verified": false,
-        "line_number": 2415
+        "line_number": 2425
       },
       {
         "type": "Hex High Entropy String",
         "filename": "tests/test_client.py",
         "hashed_secret": "b6c0bd08fde409c18760f32bef8705191840c402",
         "is_verified": false,
-        "line_number": 2416
+        "line_number": 2426
       }
     ],
     "tests/test_deprecated_aliases_endpoints.py": [
@@ -409,9 +409,9 @@
         "filename": "tests/test_drs.py",
         "hashed_secret": "5666c088b494f26cd8f63ace013992f5fc391ce0",
         "is_verified": false,
-        "line_number": 38
+        "line_number": 37
       }
     ]
   },
-  "generated_at": "2023-04-20T22:58:41Z"
+  "generated_at": "2023-05-02T20:30:00Z"
 }
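
The churn above is mechanical: detect-secrets records the line number of every flagged string, so the test-file edits later in this PR shift those numbers, and generated_at records when the baseline was re-scanned. Assuming the repo's standard tooling, a refresh like this is typically produced with:

detect-secrets scan --baseline .secrets.baseline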
5 changes: 5 additions & 0 deletions indexd/app.py
@@ -3,6 +3,7 @@
 
 from alembic.config import main as alembic_main
 import cdislogging
+from distutils.command.config import config

Review comment (Contributor): Do you still need this config here?

 import flask
 
 from indexd.index.drivers.alchemy import Base as IndexBase
@@ -15,6 +16,8 @@
 from .drs.blueprint import blueprint as indexd_guid_blueprint
 from .guid.blueprint import blueprint as indexd_drs_blueprint
 from .blueprint import blueprint as cross_blueprint
+from .cache import cache
 
 from indexd.urls.blueprint import blueprint as index_urls_blueprint
@@ -54,6 +57,8 @@ def app_init(app, settings=None):
 def get_app(settings=None):
     app = flask.Flask("indexd")
 
+    cache.init_app(app)
+
     if "INDEXD_SETTINGS" in os.environ:
         sys.path.append(os.environ["INDEXD_SETTINGS"])
 
5 changes: 5 additions & 0 deletions indexd/cache.py
@@ -0,0 +1,5 @@
+from distutils.command.config import config
+from functools import cache
+from flask_caching import Cache
+
+cache = Cache(config={"CACHE_TYPE": "simple"})
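
For context, a minimal sketch (not part of this diff) of how this Flask-Caching setup behaves once get_app calls cache.init_app(app): the "simple" backend is an in-memory, per-process store, so each indexd worker keeps its own copy and entries expire after the given timeout. The bucket payload below is made up for illustration:

import flask
from flask_caching import Cache

cache = Cache(config={"CACHE_TYPE": "simple"})  # in-memory, per-process backend

app = flask.Flask("indexd")
cache.init_app(app)

# Flask-Caching operations need an application context.
with app.app_context():
    cache.set("bucket_region_info", {"S3_BUCKETS": {"demo-bucket": "us-east-1"}}, timeout=3600)
    print(cache.get("bucket_region_info"))  # the cached dict, until the hour elapses
    print(cache.get("missing-key"))         # None on a cache miss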
78 changes: 78 additions & 0 deletions indexd/index/drivers/alchemy.py
@@ -1,8 +1,13 @@
 import os
+import time
 
+import datetime
+from urllib import response
 import uuid
 import json
 from contextlib import contextmanager
+from cdislogging import get_logger
+import requests
 from sqlalchemy import (
     BigInteger,
     Column,
@@ -22,7 +27,9 @@
 from sqlalchemy.ext.declarative import declarative_base
 from sqlalchemy.orm import joinedload, relationship, sessionmaker
 from sqlalchemy.orm.exc import MultipleResultsFound, NoResultFound
+import urllib.parse
 
+from indexd.cache import cache
 from indexd import auth
 from indexd.errors import UserError, AuthError
 from indexd.index.driver import IndexDriverABC
@@ -108,6 +115,72 @@ class IndexRecord(Base):
         "IndexRecordAlias", backref="index_record", cascade="all, delete-orphan"
     )
 
+    def url_to_bucket_region_mapping(self, url):
+        """
+        Map the URL of an object location to the region of its bucket.
+
+        Args:
+            url (str): the URL of the object location in the bucket
+
+        Returns:
+            region (str): the region of the bucket where the object is located,
+                or None if it cannot be determined
+        """
+
+        storage_to_config_map = {"s3": "S3_BUCKETS", "gs": "GS_BUCKETS"}
+        parsed_url = urllib.parse.urlparse(url)
+        cloud_storage_service = parsed_url.scheme
+        bucket_name = parsed_url.netloc
+
+        # only s3 and gs URLs can map to a configured bucket region
+        if cloud_storage_service not in storage_to_config_map:
+            return None
+
+        bucket_region_info = cache.get("bucket_region_info")
+
+        # on a cache miss, try to retrieve the info from fence and cache it
+        if bucket_region_info is None:
+            hostname = os.environ["HOSTNAME"]
+            fence_url = "http://" + hostname + "/user/bucket_info/region"
+            retry_count = 0
+            while retry_count < 3:
+                response = requests.get(fence_url)
+                if response.status_code == 200:
+                    if response.json() is not None:
+                        # cache the mapping for an hour
+                        bucket_region_info = response.json()
+                        cache.set("bucket_region_info", response.json(), timeout=3600)
+                    else:
+                        print(
+                            "/bucket_info/region from fence returned 200 but no data found"
+                        )

Review comment (Contributor): Use the logger for this.

+                    break
+                else:
+                    print(
+                        "/bucket_info/region from fence returned status {} with {}".format(
+                            response.status_code, response.json()
+                        )
+                    )

Review comment (Contributor): Use the logger.

+                    # wait 8 seconds before retrying
+                    time.sleep(2**3)
+                    retry_count += 1
+
+        # if bucket_region_info is still empty, no buckets are configured in the fence config
+        if bucket_region_info is not None:
+            # check cloud provider -> cloud bucket
+            if (
+                bucket_name
+                in bucket_region_info[storage_to_config_map[cloud_storage_service]]
+            ):
+                return bucket_region_info[storage_to_config_map[cloud_storage_service]][
+                    bucket_name
+                ]
+            else:
+                print(
+                    "Bucket not configured in fence config for {}".format(
+                        cloud_storage_service + "://" + bucket_name
+                    )
+                )
+                return None
+        else:
+            print("No buckets configured in fence config")
+            return None
+
     def to_document_dict(self):
         """
         Get the full index document
@@ -118,9 +191,14 @@ def to_document_dict(self):
         hashes = {h.hash_type: h.hash_value for h in self.hashes}
         metadata = {m.key: m.value for m in self.index_metadata}
 
+        # call the fence /bucket_info/region endpoint to fill in the region in each URL's metadata
         urls_metadata = {
             u.url: {m.key: m.value for m in u.url_metadata} for u in self.urls
         }
 
+        for u in urls_metadata:
+            urls_metadata[u]["region"] = self.url_to_bucket_region_mapping(u)
+
         created_date = self.created_date.isoformat()
         updated_date = self.updated_date.isoformat()
         content_created_date = (
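
For intuition, here is a small standalone sketch of what the new url_to_bucket_region_mapping logic does once the fence response is cached. The payload shape (S3_BUCKETS/GS_BUCKETS keyed by bucket name) follows the code above; the bucket names, regions, and the region_for helper are made up for illustration:

import urllib.parse

# Hypothetical payload shaped like the fence /bucket_info/region response;
# the bucket names and regions below are illustrative only.
bucket_region_info = {
    "S3_BUCKETS": {"my-s3-bucket": "us-east-1"},
    "GS_BUCKETS": {"my-gs-bucket": "us-central1"},
}
storage_to_config_map = {"s3": "S3_BUCKETS", "gs": "GS_BUCKETS"}

def region_for(url):
    # urlparse("s3://my-s3-bucket/key") -> scheme "s3", netloc "my-s3-bucket"
    parsed = urllib.parse.urlparse(url)
    buckets = bucket_region_info.get(storage_to_config_map.get(parsed.scheme, ""), {})
    return buckets.get(parsed.netloc)

print(region_for("s3://my-s3-bucket/some/object"))  # -> us-east-1
print(region_for("gs://unknown-bucket/obj"))        # -> None (not configured)

With regions resolved this way, the new loop in to_document_dict tags every URL in urls_metadata with a "region" key (None when no mapping exists).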