Skip to content

Commit

Permalink
get_tabular_rows can accept a list of urls in url
Browse files Browse the repository at this point in the history
  • Loading branch information
Mike committed Sep 22, 2022
1 parent 409541a commit 57a951c
Show file tree
Hide file tree
Showing 5 changed files with 162 additions and 81 deletions.
27 changes: 18 additions & 9 deletions doc/main.md
Original file line number Diff line number Diff line change
Expand Up @@ -67,6 +67,9 @@ Arguments about choosing between dict and list are all made consistent - dict_fo
## Downloading files

Various utilities to help with downloading files. Includes retrying by default.
The `Download` class inherits from `BaseDownload` which specifies a number of standard
methods that all downloaders should have: `download_file`, `download_text`,
`download_yaml`, `download_json` and `get_tabular_rows`.

For example, given YAML file extraparams.yml:

Expand Down Expand Up @@ -163,25 +166,31 @@ Other useful functions:
Download.get_column_positions(["a", "b", "c"])
# == {"a": 0, "b": 1, "c": 2}

For more detail and additional functions, check the API docs mentioned earlier in the [usage section](#usage).
For more detail and additional functions, check the API docs mentioned earlier in the
[usage section](#usage).

## Retrieving files

When you download a file, you can opt to download from the web as usual or download from the web and save for future
reuse or use the previously downloaded file. The advantage is this is all handled in the class so you don't need to do
lots of if-else conditions for the different cases for each download in your code. This is helpful for example when
trying to generate test data.
When you download a file, you can opt to download from the web as usual or download from
the web and save for future reuse or use the previously downloaded file. The
advantage is this is all handled in the class so you don't need to do lots of if-else
conditions for the different cases for each download in your code. This is helpful for
example when trying to generate test data.

All the downloads in your code can be switched between the different modes by setting the save and use_saved flags when
All the downloads in your code can be switched between the different modes by setting
the save and use_saved flags when
constructing the Retrieve object.

retriever = Retrieve(downloader, fallback_dir, saved_dir, temp_dir, save, use_saved)

- `save=False, use_saved=False` - download from web as normal (files will go in temp_folder and be discarded)
- `save=True, use_saved=False` - download from web as normal (files will go in saved_dir and will be kept)
- `save=False, use_saved=False` - download from web as normal (files will go in
temp_folder and be discarded)
- `save=True, use_saved=False` - download from web as normal (files will go in saved_dir
and will be kept)
- `save=False, use_saved=True` - use files from saved_dir (don't download at all)

fallback_dir is a folder containing static fallback files which can optionally be used if the download fails.
fallback_dir is a folder containing static fallback files which can optionally be used
if the download fails.

Methods in the Retrieve class are:
- `retrieve_file` returns a path to a file
Expand Down
4 changes: 2 additions & 2 deletions src/hdx/utilities/base_downloader.py
Original file line number Diff line number Diff line change
Expand Up @@ -97,7 +97,7 @@ def download_json(self, url: str, *args: Any, **kwargs: Any) -> Any:
@abstractmethod
def get_tabular_rows(
self,
url: str,
url: Union[str, ListTuple[str]],
headers: Union[int, ListTuple[int], ListTuple[str]] = 1,
dict_form: bool = False,
*args: Any,
Expand All @@ -112,7 +112,7 @@ def get_tabular_rows(
or a list, defaulting to a list.
Args:
url (str): URL or path to read from
url (Union[str, ListTuple[str]]): A single or list of URLs or paths to read from
headers (Union[int, ListTuple[int], ListTuple[str]]): Number of row(s) containing headers or list of headers. Defaults to 1.
dict_form (bool): Return dict or list for each row. Defaults to False (list)
*args (Any): Positional arguments
Expand Down
118 changes: 114 additions & 4 deletions src/hdx/utilities/downloader.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
"""Downloading utilities for urls"""
import copy
import hashlib
import logging
from copy import deepcopy
from os import remove
from os.path import exists, isfile, join, split, splitext
from pathlib import Path
Expand Down Expand Up @@ -594,7 +594,7 @@ def get_frictionless_resource(
raise DownloadError(str(e)) from e
return self.response

def get_tabular_rows(
def _get_tabular_rows(
self,
url: str,
headers: Union[int, ListTuple[int], ListTuple[str]] = 1,
Expand Down Expand Up @@ -668,7 +668,7 @@ def get_tabular_rows(
if header_insertions is None or origheaders is None:
headers = origheaders
else:
headers = copy.deepcopy(origheaders)
headers = deepcopy(origheaders)
for position, header in header_insertions:
headers.insert(position, header)

Expand All @@ -691,6 +691,116 @@ def get_next():

return headers, get_next()

def get_tabular_rows(
self,
url: Union[str, ListTuple[str]],
headers: Union[int, ListTuple[int], ListTuple[str]] = 1,
dict_form: bool = False,
include_headers: bool = False,
ignore_blank_rows: bool = True,
infer_types: bool = False,
header_insertions: Optional[ListTuple[Tuple[int, str]]] = None,
row_function: Optional[
Callable[[List[str], ListDict], ListDict]
] = None,
has_hxl: bool = False,
**kwargs: Any,
) -> Tuple[List[str], Iterator[ListDict]]:
"""Returns header of tabular file(s) pointed to by url and an iterator where
each row is returned as a list or dictionary depending on the dict_rows argument.
When a list of urls is supplied (in url), then the has_hxl flag indicates if the
files are HXLated so that the HXL row is only included from the first file.
The headers argument is either a row number or list of row numbers (in case of
multi-line headers) to be considered as headers (rows start counting at 1), or
the actual headers defined as a list of strings. It defaults to 1. The dict_form
argument specifies if each row should be returned as a dictionary or a list,
defaulting to a list.
Optionally, headers can be inserted at specific positions. This is achieved
using the header_insertions argument. If supplied, it is a list of tuples of the
form (position, header) to be inserted. A function is called for each row. If
supplied, it takes as arguments: headers (prior to any insertions) and row
(which will be in dict or list form depending upon the dict_rows argument) and
outputs a modified row or None to ignore the row.
Args:
url (Union[str, ListTuple[str]]): A single or list of URLs or paths to read from
headers (Union[int, ListTuple[int], ListTuple[str]]): Number of row(s) containing headers or list of headers. Defaults to 1.
dict_form (bool): Return dict or list for each row. Defaults to False (list)
include_headers (bool): Whether to include headers in iterator. Defaults to False.
ignore_blank_rows (bool): Whether to ignore blank rows. Defaults to True.
infer_types (bool): Whether to infer types. Defaults to False (strings).
header_insertions (Optional[ListTuple[Tuple[int,str]]]): List of (position, header) to insert. Defaults to None.
row_function (Optional[Callable[[List[str],ListDict],ListDict]]): Function to call for each row. Defaults to None.
has_hxl (bool): Whether files have HXL hashtags. Ignored for single url. Defaults to False.
**kwargs:
format (Optional[str]): Type of file. Defaults to inferring.
file_type (Optional[str]): Type of file. Defaults to inferring.
encoding (Optional[str]): Type of encoding. Defaults to inferring.
compression (Optional[str]): Type of compression. Defaults to inferring.
delimiter (Optional[str]): Delimiter for values in csv rows. Defaults to inferring.
skip_initial_space (bool): Ignore whitespace straight after delimiter. Defaults to False.
sheet (Optional[Union[int, str]): Sheet in Excel. Defaults to inferring.
fill_merged_cells (bool): Whether to fill merged cells. Defaults to True.
http_session (Session): Session object to use. Defaults to downloader session.
columns (Union[ListTuple[int], ListTuple[str], None]): Columns to pick. Defaults to all.
default_type (Optional[str]): Default field type if infer_types False. Defaults to string.
float_numbers (bool): Use float not Decimal if infer_types True. Defaults to True.
null_values (List[Any]): Values that will return None. Defaults to [""].
dialect (Dialect): This can be set to override the above. See Frictionless docs.
detector (Detector): This can be set to override the above. See Frictionless docs.
layout (Layout): This can be set to override the above. See Frictionless docs.
schema (Schema): This can be set to override the above. See Frictionless docs.
Returns:
Tuple[List[str],Iterator[ListDict]]: Tuple (headers, iterator where each row is a list or dictionary)
"""
if isinstance(url, list):
is_list = True
orig_kwargs = deepcopy(kwargs)
urls = url
url = urls[0]
else:
is_list = False
outheaders, iterator1 = self._get_tabular_rows(
url,
headers,
dict_form,
include_headers,
ignore_blank_rows,
infer_types,
header_insertions,
row_function,
**kwargs,
)
if not is_list:
return outheaders, iterator1

def make_iterator():
for row in iterator1:
yield row
for url in urls[1:]:
temp_kwargs = deepcopy(orig_kwargs)
_, iterator = self._get_tabular_rows(
url,
headers,
dict_form,
include_headers,
ignore_blank_rows,
infer_types,
header_insertions,
row_function,
**temp_kwargs,
)
if has_hxl:
next(iterator)
for row in iterator:
yield row

return outheaders, make_iterator()

def get_tabular_rows_as_list(
self,
url: str,
Expand Down Expand Up @@ -1100,7 +1210,7 @@ def generate_downloaders(

cls.downloaders = {"default": cls(**kwargs)}
for name in custom_configs:
args_copy = copy.deepcopy(kwargs)
args_copy = deepcopy(kwargs)
args_copy.update(custom_configs[name])
cls.downloaders[name] = cls(**args_copy)

Expand Down
90 changes: 26 additions & 64 deletions src/hdx/utilities/retriever.py
Original file line number Diff line number Diff line change
Expand Up @@ -386,26 +386,31 @@ def download_json(

def get_tabular_rows(
self,
url: str,
url: Union[str, ListTuple[str]],
headers: Union[int, ListTuple[int], ListTuple[str]] = 1,
dict_form: bool = False,
has_hxl: bool = False,
filename: Optional[str] = None,
logstr: Optional[str] = None,
fallback: bool = False,
**kwargs: Any,
) -> Tuple[List[str], Iterator[ListDict]]:
"""Returns header of tabular file pointed to by url and an iterator where each
row is returned as a list or dictionary depending on the dict_rows argument.
"""Returns header of tabular file(s) pointed to by url and an iterator where
each row is returned as a list or dictionary depending on the dict_rows argument.
When a list of urls is supplied (in url), then the has_hxl flag indicates if the
files are HXLated so that the HXL row is only included from the first file.
The headers argument is either a row number or list of row numbers (in case of
multi-line headers) to be considered as headers (rows start counting at 1), or
the actual headers defined as a list of strings. It defaults to 1.
The dict_form arguments specifies if each row should be returned as a dictionary
or a list, defaulting to a list.
Args:
url (str): URL or path to read from
url (Union[str, ListTuple[str]]): A single or list of URLs or paths to read from
headers (Union[int, ListTuple[int], ListTuple[str]]): Number of row(s) containing headers or list of headers. Defaults to 1.
dict_form (bool): Return dict or list for each row. Defaults to False (list)
has_hxl (bool): Whether files have HXL hashtags. Defaults to False.
filename (Optional[str]): Filename of saved file. Defaults to getting from url.
logstr (Optional[str]): Text to use in log string to describe download. Defaults to filename.
fallback (bool): Whether to use static fallback if download fails. Defaults to False.
Expand All @@ -415,71 +420,28 @@ def get_tabular_rows(
Tuple[List[str],Iterator[ListDict]]: Tuple (headers, iterator where each row is a list or dictionary)
"""
if isinstance(url, list):
is_list = True
orig_kwargs = deepcopy(kwargs)
urls = url
url = urls[0]
else:
is_list = False
path = self.download_file(url, filename, logstr, fallback, **kwargs)
kwargs.pop("file_prefix", None)
return self.downloader.get_tabular_rows(
path, headers, dict_form, **kwargs
)

def get_tabular_rows_multi_url(
self,
urls: ListTuple[str],
headers: Union[int, ListTuple[int], ListTuple[str]] = 1,
dict_form: bool = False,
has_hxl: bool = False,
logstr: Optional[str] = None,
fallback: bool = False,
**kwargs: Any,
) -> Tuple[List[str], Iterator[ListDict]]:
"""Returns header of tabular file pointed to by url and an iterator where each
row is returned as a list or dictionary depending on the dict_rows argument.
The headers argument is either a row number or list of row numbers (in case of
multi-line headers) to be considered as headers (rows start counting at 1), or
the actual headers defined as a list of strings. It defaults to 1.
The dict_form arguments specifies if each row should be returned as a dictionary
or a list, defaulting to a list.
Args:
urls (ListTuple[str]): URLs or paths to read from
headers (Union[int, ListTuple[int], ListTuple[str]]): Number of row(s) containing headers or list of headers. Defaults to 1.
dict_form (bool): Return dict or list for each row. Defaults to False (list)
has_hxl (bool): Whether files have HXL hashtags. Defaults to False.
logstr (Optional[str]): Text to use in log string to describe download. Defaults to filename.
fallback (bool): Whether to use static fallback if download fails. Defaults to False.
**kwargs: Parameters to pass to download_file call
Returns:
Tuple[List[str],Iterator[ListDict]]: Tuple (headers, iterator where each row is a list or dictionary)
if is_list:
path = [path]
for url in urls[1:]:
temp_kwargs = deepcopy(orig_kwargs)
pth = self.download_file(
url, None, logstr, fallback, **temp_kwargs
)
path.append(pth)

"""
paths = list()
for url in urls:
temp_kwargs = deepcopy(kwargs)
path = self.download_file(
url, None, logstr, fallback, **temp_kwargs
)
paths.append(path)
kwargs.pop("file_prefix", None)
temp_kwargs = deepcopy(kwargs)
outheaders, iterator1 = self.downloader.get_tabular_rows(
paths[0], headers, dict_form, **temp_kwargs
return self.downloader.get_tabular_rows(
path, headers, dict_form, has_hxl=has_hxl, **kwargs
)

def make_iterator():
for row in iterator1:
yield row
for path in paths[1:]:
temp_kwargs = deepcopy(kwargs)
_, iterator = self.downloader.get_tabular_rows(
path, headers, dict_form, **temp_kwargs
)
if has_hxl:
next(iterator)
for row in iterator:
yield row

return outheaders, make_iterator()

@classmethod
def generate_retrievers(
cls,
Expand Down
4 changes: 2 additions & 2 deletions tests/hdx/utilities/test_retriever.py
Original file line number Diff line number Diff line change
Expand Up @@ -269,7 +269,7 @@ def test_get_tabular_rows_multi_url(
) as retriever:
filename = "test.csv"
url = join(retrieverfolder, filename)
headers, iterator = retriever.get_tabular_rows_multi_url(
headers, iterator = retriever.get_tabular_rows(
[url, url], logstr="test file", fallback=False
)
assert headers == ["header1", "header2", "header3", "header4"]
Expand All @@ -281,7 +281,7 @@ def test_get_tabular_rows_multi_url(
]
filename = "test_hxl.csv"
url = join(retrieverfolder, filename)
headers, iterator = retriever.get_tabular_rows_multi_url(
headers, iterator = retriever.get_tabular_rows(
[url, url],
has_hxl=True,
logstr="test file",
Expand Down

0 comments on commit 57a951c

Please sign in to comment.