Skip to content

Commit

Permalink
get_tabular_rows can accept a list of urls in url
Browse files Browse the repository at this point in the history
  • Loading branch information
Mike committed Sep 22, 2022
1 parent 409541a commit 57a951c
Show file tree
Hide file tree
Showing 5 changed files with 162 additions and 81 deletions.
27 changes: 18 additions & 9 deletions doc/main.md
Original file line number Diff line number Diff line change
Expand Up @@ -67,6 +67,9 @@ Arguments about choosing between dict and list are all made consistent - dict_fo
## Downloading files

Various utilities to help with downloading files. Includes retrying by default.
The `Download` class inherits from `BaseDownload` which specifies a number of standard
methods that all downloaders should have: `download_file`, `download_text`,
`download_yaml`, `download_json` and `get_tabular_rows`.

For example, given YAML file extraparams.yml:

Expand Down Expand Up @@ -163,25 +166,31 @@ Other useful functions:
Download.get_column_positions(["a", "b", "c"])
# == {"a": 0, "b": 1, "c": 2}

For more detail and additional functions, check the API docs mentioned earlier in the [usage section](#usage).
For more detail and additional functions, check the API docs mentioned earlier in the
[usage section](#usage).

## Retrieving files

When you download a file, you can opt to download from the web as usual or download from the web and save for future
reuse or use the previously downloaded file. The advantage is this is all handled in the class so you don't need to do
lots of if-else conditions for the different cases for each download in your code. This is helpful for example when
trying to generate test data.
When you download a file, you can opt to download from the web as usual or download from
the web and save for future reuse or use the previously downloaded file. The
advantage is this is all handled in the class so you don't need to do lots of if-else
conditions for the different cases for each download in your code. This is helpful for
example when trying to generate test data.

All the downloads in your code can be switched between the different modes by setting the save and use_saved flags when
All the downloads in your code can be switched between the different modes by setting
the save and use_saved flags when
constructing the Retrieve object.

retriever = Retrieve(downloader, fallback_dir, saved_dir, temp_dir, save, use_saved)

- `save=False, use_saved=False` - download from web as normal (files will go in temp_folder and be discarded)
- `save=True, use_saved=False` - download from web as normal (files will go in saved_dir and will be kept)
- `save=False, use_saved=False` - download from web as normal (files will go in
temp_folder and be discarded)
- `save=True, use_saved=False` - download from web as normal (files will go in saved_dir
and will be kept)
- `save=False, use_saved=True` - use files from saved_dir (don't download at all)

fallback_dir is a folder containing static fallback files which can optionally be used if the download fails.
fallback_dir is a folder containing static fallback files which can optionally be used
if the download fails.

Methods in the Retrieve class are:
- `retrieve_file` returns a path to a file
Expand Down
4 changes: 2 additions & 2 deletions src/hdx/utilities/base_downloader.py
Original file line number Diff line number Diff line change
Expand Up @@ -97,7 +97,7 @@ def download_json(self, url: str, *args: Any, **kwargs: Any) -> Any:
@abstractmethod
def get_tabular_rows(
self,
url: str,
url: Union[str, ListTuple[str]],
headers: Union[int, ListTuple[int], ListTuple[str]] = 1,
dict_form: bool = False,
*args: Any,
Expand All @@ -112,7 +112,7 @@ def get_tabular_rows(
or a list, defaulting to a list.
Args:
url (str): URL or path to read from
url (Union[str, ListTuple[str]]): A single or list of URLs or paths to read from
headers (Union[int, ListTuple[int], ListTuple[str]]): Number of row(s) containing headers or list of headers. Defaults to 1.
dict_form (bool): Return dict or list for each row. Defaults to False (list)
*args (Any): Positional arguments
Expand Down
118 changes: 114 additions & 4 deletions src/hdx/utilities/downloader.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
"""Downloading utilities for urls"""
import copy
import hashlib
import logging
from copy import deepcopy
from os import remove
from os.path import exists, isfile, join, split, splitext
from pathlib import Path
Expand Down Expand Up @@ -594,7 +594,7 @@ def get_frictionless_resource(
raise DownloadError(str(e)) from e
return self.response

def get_tabular_rows(
def _get_tabular_rows(
self,
url: str,
headers: Union[int, ListTuple[int], ListTuple[str]] = 1,
Expand Down Expand Up @@ -668,7 +668,7 @@ def get_tabular_rows(
if header_insertions is None or origheaders is None:
headers = origheaders
else:
headers = copy.deepcopy(origheaders)
headers = deepcopy(origheaders)
for position, header in header_insertions:
headers.insert(position, header)

Expand All @@ -691,6 +691,116 @@ def get_next():

return headers, get_next()

def get_tabular_rows(
self,
url: Union[str, ListTuple[str]],
headers: Union[int, ListTuple[int], ListTuple[str]] = 1,
dict_form: bool = False,
include_headers: bool = False,
ignore_blank_rows: bool = True,
infer_types: bool = False,
header_insertions: Optional[ListTuple[Tuple[int, str]]] = None,
row_function: Optional[
Callable[[List[str], ListDict], ListDict]
] = None,
has_hxl: bool = False,
**kwargs: Any,
) -> Tuple[List[str], Iterator[ListDict]]:
"""Returns header of tabular file(s) pointed to by url and an iterator where
each row is returned as a list or dictionary depending on the dict_rows argument.
When a list of urls is supplied (in url), then the has_hxl flag indicates if the
files are HXLated so that the HXL row is only included from the first file.
The headers argument is either a row number or list of row numbers (in case of
multi-line headers) to be considered as headers (rows start counting at 1), or
the actual headers defined as a list of strings. It defaults to 1. The dict_form
argument specifies if each row should be returned as a dictionary or a list,
defaulting to a list.
Optionally, headers can be inserted at specific positions. This is achieved
using the header_insertions argument. If supplied, it is a list of tuples of the
form (position, header) to be inserted. A function is called for each row. If
supplied, it takes as arguments: headers (prior to any insertions) and row
(which will be in dict or list form depending upon the dict_rows argument) and
outputs a modified row or None to ignore the row.
Args:
url (Union[str, ListTuple[str]]): A single or list of URLs or paths to read from
headers (Union[int, ListTuple[int], ListTuple[str]]): Number of row(s) containing headers or list of headers. Defaults to 1.
dict_form (bool): Return dict or list for each row. Defaults to False (list)
include_headers (bool): Whether to include headers in iterator. Defaults to False.
ignore_blank_rows (bool): Whether to ignore blank rows. Defaults to True.
infer_types (bool): Whether to infer types. Defaults to False (strings).
header_insertions (Optional[ListTuple[Tuple[int,str]]]): List of (position, header) to insert. Defaults to None.
row_function (Optional[Callable[[List[str],ListDict],ListDict]]): Function to call for each row. Defaults to None.
has_hxl (bool): Whether files have HXL hashtags. Ignored for single url. Defaults to False.
**kwargs:
format (Optional[str]): Type of file. Defaults to inferring.
file_type (Optional[str]): Type of file. Defaults to inferring.
encoding (Optional[str]): Type of encoding. Defaults to inferring.
compression (Optional[str]): Type of compression. Defaults to inferring.
delimiter (Optional[str]): Delimiter for values in csv rows. Defaults to inferring.
skip_initial_space (bool): Ignore whitespace straight after delimiter. Defaults to False.
sheet (Optional[Union[int, str]): Sheet in Excel. Defaults to inferring.
fill_merged_cells (bool): Whether to fill merged cells. Defaults to True.
http_session (Session): Session object to use. Defaults to downloader session.
columns (Union[ListTuple[int], ListTuple[str], None]): Columns to pick. Defaults to all.
default_type (Optional[str]): Default field type if infer_types False. Defaults to string.
float_numbers (bool): Use float not Decimal if infer_types True. Defaults to True.
null_values (List[Any]): Values that will return None. Defaults to [""].
dialect (Dialect): This can be set to override the above. See Frictionless docs.
detector (Detector): This can be set to override the above. See Frictionless docs.
layout (Layout): This can be set to override the above. See Frictionless docs.
schema (Schema): This can be set to override the above. See Frictionless docs.
Returns:
Tuple[List[str],Iterator[ListDict]]: Tuple (headers, iterator where each row is a list or dictionary)
"""
if isinstance(url, list):
is_list = True
orig_kwargs = deepcopy(kwargs)
urls = url
url = urls[0]
else:
is_list = False
outheaders, iterator1 = self._get_tabular_rows(
url,
headers,
dict_form,
include_headers,
ignore_blank_rows,
infer_types,
header_insertions,
row_function,
**kwargs,
)
if not is_list:
return outheaders, iterator1

def make_iterator():
for row in iterator1:
yield row
for url in urls[1:]:
temp_kwargs = deepcopy(orig_kwargs)
_, iterator = self._get_tabular_rows(
url,
headers,
dict_form,
include_headers,
ignore_blank_rows,
infer_types,
header_insertions,
row_function,
**temp_kwargs,
)
if has_hxl:
next(iterator)
for row in iterator:
yield row

return outheaders, make_iterator()

def get_tabular_rows_as_list(
self,
url: str,
Expand Down Expand Up @@ -1100,7 +1210,7 @@ def generate_downloaders(

cls.downloaders = {"default": cls(**kwargs)}
for name in custom_configs:
args_copy = copy.deepcopy(kwargs)
args_copy = deepcopy(kwargs)
args_copy.update(custom_configs[name])
cls.downloaders[name] = cls(**args_copy)

Expand Down
90 changes: 26 additions & 64 deletions src/hdx/utilities/retriever.py
Original file line number Diff line number Diff line change
Expand Up @@ -386,26 +386,31 @@ def download_json(

def get_tabular_rows(
self,
url: str,
url: Union[str, ListTuple[str]],
headers: Union[int, ListTuple[int], ListTuple[str]] = 1,
dict_form: bool = False,
has_hxl: bool = False,
filename: Optional[str] = None,
logstr: Optional[str] = None,
fallback: bool = False,
**kwargs: Any,
) -> Tuple[List[str], Iterator[ListDict]]:
"""Returns header of tabular file pointed to by url and an iterator where each
row is returned as a list or dictionary depending on the dict_rows argument.
"""Returns header of tabular file(s) pointed to by url and an iterator where
each row is returned as a list or dictionary depending on the dict_rows argument.
When a list of urls is supplied (in url), then the has_hxl flag indicates if the
files are HXLated so that the HXL row is only included from the first file.
The headers argument is either a row number or list of row numbers (in case of
multi-line headers) to be considered as headers (rows start counting at 1), or
the actual headers defined as a list of strings. It defaults to 1.
The dict_form arguments specifies if each row should be returned as a dictionary
or a list, defaulting to a list.
Args:
url (str): URL or path to read from
url (Union[str, ListTuple[str]]): A single or list of URLs or paths to read from
headers (Union[int, ListTuple[int], ListTuple[str]]): Number of row(s) containing headers or list of headers. Defaults to 1.
dict_form (bool): Return dict or list for each row. Defaults to False (list)
has_hxl (bool): Whether files have HXL hashtags. Defaults to False.
filename (Optional[str]): Filename of saved file. Defaults to getting from url.
logstr (Optional[str]): Text to use in log string to describe download. Defaults to filename.
fallback (bool): Whether to use static fallback if download fails. Defaults to False.
Expand All @@ -415,71 +420,28 @@ def get_tabular_rows(
Tuple[List[str],Iterator[ListDict]]: Tuple (headers, iterator where each row is a list or dictionary)
"""
if isinstance(url, list):
is_list = True
orig_kwargs = deepcopy(kwargs)
urls = url
url = urls[0]
else:
is_list = False
path = self.download_file(url, filename, logstr, fallback, **kwargs)
kwargs.pop("file_prefix", None)
return self.downloader.get_tabular_rows(
path, headers, dict_form, **kwargs
)

def get_tabular_rows_multi_url(
self,
urls: ListTuple[str],
headers: Union[int, ListTuple[int], ListTuple[str]] = 1,
dict_form: bool = False,
has_hxl: bool = False,
logstr: Optional[str] = None,
fallback: bool = False,
**kwargs: Any,
) -> Tuple[List[str], Iterator[ListDict]]:
"""Returns header of tabular file pointed to by url and an iterator where each
row is returned as a list or dictionary depending on the dict_rows argument.
The headers argument is either a row number or list of row numbers (in case of
multi-line headers) to be considered as headers (rows start counting at 1), or
the actual headers defined as a list of strings. It defaults to 1.
The dict_form arguments specifies if each row should be returned as a dictionary
or a list, defaulting to a list.
Args:
urls (ListTuple[str]): URLs or paths to read from
headers (Union[int, ListTuple[int], ListTuple[str]]): Number of row(s) containing headers or list of headers. Defaults to 1.
dict_form (bool): Return dict or list for each row. Defaults to False (list)
has_hxl (bool): Whether files have HXL hashtags. Defaults to False.
logstr (Optional[str]): Text to use in log string to describe download. Defaults to filename.
fallback (bool): Whether to use static fallback if download fails. Defaults to False.
**kwargs: Parameters to pass to download_file call
Returns:
Tuple[List[str],Iterator[ListDict]]: Tuple (headers, iterator where each row is a list or dictionary)
if is_list:
path = [path]
for url in urls[1:]:
temp_kwargs = deepcopy(orig_kwargs)
pth = self.download_file(
url, None, logstr, fallback, **temp_kwargs
)
path.append(pth)

"""
paths = list()
for url in urls:
temp_kwargs = deepcopy(kwargs)
path = self.download_file(
url, None, logstr, fallback, **temp_kwargs
)
paths.append(path)
kwargs.pop("file_prefix", None)
temp_kwargs = deepcopy(kwargs)
outheaders, iterator1 = self.downloader.get_tabular_rows(
paths[0], headers, dict_form, **temp_kwargs
return self.downloader.get_tabular_rows(
path, headers, dict_form, has_hxl=has_hxl, **kwargs
)

def make_iterator():
for row in iterator1:
yield row
for path in paths[1:]:
temp_kwargs = deepcopy(kwargs)
_, iterator = self.downloader.get_tabular_rows(
path, headers, dict_form, **temp_kwargs
)
if has_hxl:
next(iterator)
for row in iterator:
yield row

return outheaders, make_iterator()

@classmethod
def generate_retrievers(
cls,
Expand Down
4 changes: 2 additions & 2 deletions tests/hdx/utilities/test_retriever.py
Original file line number Diff line number Diff line change
Expand Up @@ -269,7 +269,7 @@ def test_get_tabular_rows_multi_url(
) as retriever:
filename = "test.csv"
url = join(retrieverfolder, filename)
headers, iterator = retriever.get_tabular_rows_multi_url(
headers, iterator = retriever.get_tabular_rows(
[url, url], logstr="test file", fallback=False
)
assert headers == ["header1", "header2", "header3", "header4"]
Expand All @@ -281,7 +281,7 @@ def test_get_tabular_rows_multi_url(
]
filename = "test_hxl.csv"
url = join(retrieverfolder, filename)
headers, iterator = retriever.get_tabular_rows_multi_url(
headers, iterator = retriever.get_tabular_rows(
[url, url],
has_hxl=True,
logstr="test file",
Expand Down

0 comments on commit 57a951c

Please sign in to comment.