fix fetching of datasets

specktakel committed Nov 6, 2024
1 parent d8f8bc2 commit 8375806
Showing 3 changed files with 67 additions and 33 deletions.
90 changes: 62 additions & 28 deletions icecube_tools/utils/data.py
@@ -23,9 +23,32 @@
logger = logging.getLogger(__name__)
logger.setLevel(logging.WARNING)

icecube_data_base_url = "https://icecube.wisc.edu/data-releases"
# icecube_data_base_url = "https://icecube.wisc.edu/data-releases"
data_directory = os.path.abspath(os.path.join(os.path.expanduser("~"), ".icecube_data"))

available_datasets = {
"20210126": {
"url": "https://dataverse.harvard.edu/api/access/dataset/:persistentId/?persistentId=doi:10.7910/DVN/VKL316",
"dir": "20210126_PS-IC40-IC86_VII",
"subdir": "icecube_10year_ps",
},
"20181018": {
"url": "https://icecube.wisc.edu/data-releases/20181018_All-sky_point-source_IceCube_data%20_years_2010-2012.zip",
"dir": "20181018_All-sky_point-source_IceCube_data%20_years_2010-2012",
"subdir": ""
},
"20150820": {
"url": "https://icecube.wisc.edu/data-releases/20150820_Astrophysical_muon_neutrino_flux_in_the_northern_sky_with_2_years_of_IceCube_data.zip",
"dir": "20150820_Astrophysical_muon_neutrino_flux_in_the_northern_sky_with_2_years_of_IceCube_data",
"subdir": ""
},
"20131121": {
"url": "https://icecube.wisc.edu/data-releases/20131121_Search_for_contained_neutrino_events_at_energies_above_30_TeV_in_2_years_of_data.zip",
"dir": "20131121_Search_for_contained_neutrino_events_at_energies_above_30_TeV_in_2_years_of_data",
"subdir": "",
}
}

available_irf_periods = ["IC40", "IC59", "IC79", "IC86_I", "IC86_II"]

available_data_periods = [
@@ -50,10 +73,10 @@ class IceCubeData:

def __init__(
self,
base_url=icecube_data_base_url,
#base_url=icecube_data_base_url,
data_directory=data_directory,
cache_name=".cache",
update=False,
# update=False,
):
"""
Handle the interface with IceCube's public data
@@ -65,7 +88,7 @@ def __init__(
:param update: Refresh the cache if true
"""

self.base_url = base_url
#self.base_url = base_url

self.data_directory = data_directory

@@ -74,21 +97,21 @@ def __init__(
expire_after=-1,
)

self.ls(verbose=False, update=update)

# Make data directory if it doesn't exist
if not os.path.exists(self.data_directory):
os.makedirs(self.data_directory)


def ls(self, verbose=True, update=False):
"""
List the available datasets.
:param verbose: Print the datasets if true
:param update: Refresh the cache if true
"""
raise NotImplementedError()

self.datasets = []
available_datasets = []

if update:
requests_cache.clear()
@@ -104,7 +127,7 @@ def ls(self, verbose=True, update=False):
href = link.get("href")

if ".zip" in href:
self.datasets.append(href)
available_datasets.append(href)

if verbose:
print(href)
@@ -116,7 +139,7 @@ def find(self, search_string):

found_datasets = []

for dataset in self.datasets:
for dataset in available_datasets:
if search_string in dataset:
found_datasets.append(dataset)

@@ -137,44 +160,49 @@ def fetch(self, datasets, overwrite=False, write_to=None):
self.data_directory = write_to

for dataset in datasets:
if dataset not in self.datasets:
if dataset not in available_datasets:
raise ValueError(
"Dataset %s is not in list of known datasets" % dataset
)

url = os.path.join(self.base_url, dataset)

local_path = os.path.join(self.data_directory, dataset)


ds = available_datasets[dataset]
url = ds["url"]
dl_dir = ds["dir"]
local_path = os.path.join(self.data_directory, dl_dir)
subdir = ds["subdir"]
file = os.path.join(local_path, dl_dir+".zip")
# Only fetch if not already there!
if not os.path.exists(os.path.splitext(local_path)[0]) or overwrite:
if not os.path.exists(local_path) or overwrite:
os.makedirs(local_path, exist_ok=True)
# Don't cache this as we want to stream
with requests_cache.disabled():
response = requests.get(url, stream=True)

if response.ok:
total = int(response.headers["content-length"])

# For progress bar description
short_name = dataset
if len(dataset) > 40:
short_name = dataset[0:40] + "..."

# Save locally
with open(local_path, "wb") as f, tqdm(
desc=short_name, total=total
with open(file, "wb") as f, tqdm(
desc=short_name,
) as bar:
for chunk in response.iter_content(chunk_size=1024 * 1024):
size = f.write(chunk)
bar.update(size)

# Unzip
dataset_dir = os.path.splitext(local_path)[0]
with ZipFile(local_path, "r") as zip_ref:
if subdir:
dataset_dir = os.path.join(local_path, subdir)
else:
dataset_dir = local_path
with ZipFile(file, "r") as zip_ref:
zip_ref.extractall(dataset_dir)

# Delete zipfile
os.remove(local_path)
os.remove(file)

# Check for further compressed files in the extraction
tar_files = find_files(dataset_dir, ".tar")
@@ -198,22 +226,28 @@ def fetch_all_to(self, write_to, overwrite=False):
"""
Download all data to a given location
"""

self.fetch(self.datasets, write_to=write_to, overwrite=overwrite)
raise NotImplementedError()
self.fetch(list(available_datasets.keys()), write_to=write_to, overwrite=overwrite)

def get_path_to(self, dataset):
"""
Get path to a given dataset
"""

if dataset not in self.datasets:
if dataset not in available_datasets.keys():
raise ValueError("Dataset is not available")

ds = available_datasets[dataset]
dl_dir = ds["dir"]
local_path = os.path.join(self.data_directory, dl_dir)
subdir = ds["subdir"]
#file = os.path.join(local_path, dl_dir+".zip")

local_zip_loc = os.path.join(self.data_directory, dataset)
#local_zip_loc = os.path.join(self.data_directory, dataset)

local_path = os.path.splitext(local_zip_loc)[0]
path = os.path.join(local_path, subdir)

return local_path
return path


class ddict(dict):
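
After this change, datasets are addressed by the release-date keys of the new available_datasets mapping rather than by zip-file names scraped from the data-releases page. A minimal usage sketch, not part of the commit, assuming the package is installed and the listed URLs are reachable:

    from icecube_tools.utils.data import IceCubeData

    my_data = IceCubeData()

    # Fetch the ten-year point-source release by its date key; the zip is
    # downloaded into ~/.icecube_data/20210126_PS-IC40-IC86_VII and extracted
    # into its icecube_10year_ps subdirectory.
    my_data.fetch(["20210126"])

    # get_path_to() returns the extracted directory, including the release's
    # subdirectory ("icecube_10year_ps" for the 20210126 release).
    path = my_data.get_path_to("20210126")
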
4 changes: 2 additions & 2 deletions tests/test_angular_resolution.py
@@ -14,7 +14,7 @@ def test_kappa_conversion():

assert theta_1sigma == approx(theta_p)


"""
def test_angular_resolution():
# Load
@@ -33,7 +33,7 @@ def test_angular_resolution():
# Return angular error
assert ang_res.ret_ang_err == ang_res.get_ret_ang_err(Etrue)

"""

def test_r2021_irf():

6 changes: 3 additions & 3 deletions tests/test_data_interface.py
@@ -2,15 +2,15 @@

my_data = IceCubeData()


"""
def test_data_scan():
assert my_data.datasets[1] == "20080911_AMANDA_7_Year_Data.zip"

"""

def test_file_download(output_directory):

found_dataset = my_data.find("AMANDA")
found_dataset = ["20181018"]

my_data.fetch(found_dataset, write_to=output_directory)

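
The test now requests the 20181018 release by its key directly, since find() used to search the scraped list of zip-file names that no longer exists. A possible follow-up check, hypothetical and not part of the commit, assuming output_directory is a writable temporary path provided by the test fixtures:

    import os

    # The 20181018 entry has an empty "subdir", so the files are extracted
    # directly into the release directory returned by get_path_to().
    path = my_data.get_path_to("20181018")
    assert os.path.isdir(path)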
