From 2647fc9fdbbbef9587a3b969f506d9fe75cd0835 Mon Sep 17 00:00:00 2001
From: Qianqian Fang <fangqq@gmail.com>
Date: Sun, 31 Mar 2024 15:36:11 -0400
Subject: [PATCH] [feat] port jsonpath from JSONLab, add loadurl for REST API

---
 jdata/__init__.py |   4 ++
 jdata/jfile.py    |  39 +++++++++--
 jdata/jpath.py    | 165 ++++++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 203 insertions(+), 5 deletions(-)
 create mode 100644 jdata/jpath.py

diff --git a/jdata/__init__.py b/jdata/__init__.py
index d39ab89..563ecc6 100644
--- a/jdata/__init__.py
+++ b/jdata/__init__.py
@@ -36,6 +36,7 @@
 from .jfile import (
     load,
     save,
+    loadurl,
     show,
     loadt,
     savet,
@@ -48,11 +49,13 @@
     jext,
 )
 from .jdata import encode, decode, jdtype, jsonfilter
+from .jpath import jsonpath
 
 __version__ = "0.5.5"
 __all__ = [
     "load",
     "save",
+    "loadurl",
     "show",
     "loadt",
     "savet",
@@ -67,6 +70,7 @@
     "jdtype",
     "jsonfilter",
     "jext",
+    "jsonpath",
 ]
 __license__ = """Apache license 2.0, Copyright (c) 2019-2024 Qianqian Fang"""
 
diff --git a/jdata/jfile.py b/jdata/jfile.py
index 2cb94b1..1f8d10c 100644
--- a/jdata/jfile.py
+++ b/jdata/jfile.py
@@ -7,6 +7,7 @@
 __all__ = [
     "load",
     "save",
+    "loadurl",
     "show",
     "loadt",
     "savet",
@@ -54,7 +55,7 @@ def load(fname, opt={}, **kwargs):
     """
     if re.match("^https*://", fname):
         newdata = downloadlink(fname, opt, **kwargs)
-        return newdata
+        return newdata[0]
 
     spl = os.path.splitext(fname)
     ext = spl[1].lower()
@@ -102,6 +103,24 @@ def save(data, fname, opt={}, **kwargs):
         )
 
 
+def loadurl(url, opt={}, **kwargs):
+    """@brief Loading a JData file (binary or text) from a URL without caching locally
+
+    @param[in] url: a REST API URL, currently only supports http:// and https://
+    @param[in] opt: options, opt['nocache']=True by default, setting it to False downloads and locally caches the data
+    @param[in] kwargs: extra keyword arguments passed through to downloadlink
+    @return the downloaded data (first item returned by downloadlink); raises Exception for a non-http(s) url
+    """
+    if not re.match("^https*://", url):
+        raise Exception("JData", "input to loadurl is not a valid URL")
+
+    # copy the options so that neither the caller's dict nor the shared default changes
+    opt = dict(opt)
+    opt.setdefault("nocache", True)
+    newdata = downloadlink(url, opt, **kwargs)
+    return newdata[0]
+
+
 ##====================================================================================
 ## Loading and saving text-based JData (i.e. JSON) files
 ##====================================================================================
@@ -370,7 +389,6 @@ def jsoncache(url, opt={}, **kwargs):
         if p is not None:
             cachepath.insert(0, p)
         elif dbname and docname:
-            print([domain, dbname, docname, cachepath])
             cachepath = [os.path.join(x, domain, dbname, docname) for x in cachepath]
         if filename is not None:
             for i in range(len(cachepath)):
@@ -421,7 +439,7 @@ def jdlink(uripath, opt={}, **kwargs):
             )
         alloutput = [[] for _ in range(3)]
         for i in range(len(uripath)):
-            newdata, fname, cachepath = downloadlink(uripath[i], opt)
+            newdata, fname, cachepath = downloadlink(uripath[i], opt, **kwargs)
             alloutput[0].append(newdata)
             alloutput[1].append(fname)
             alloutput[2].append(cachepath)
@@ -429,13 +447,24 @@ def jdlink(uripath, opt={}, **kwargs):
             alloutput = [x[0] for x in alloutput]
         newdata, fname, cachepath = tuple(alloutput)
     elif isinstance(uripath, str):
-        newdata, fname, cachepath = downloadlink(uripath, opt)
+        newdata, fname, cachepath = downloadlink(uripath, opt, **kwargs)
     return newdata, fname
 
 
-def downloadlink(uripath, opt={}):
+def downloadlink(uripath, opt={}, **kwargs):
     opt.setdefault("showlink", 1)
 
+    if "nocache" in opt and opt["nocache"]:
+        newdata = urllib.request.urlopen(uripath).read()
+        try:
+            newdata = loadts(newdata, opt, **kwargs)
+        except:
+            try:
+                newdata = loadbs(newdata, opt, **kwargs)
+            except:
+                pass
+        return newdata, uripath, None
+
     newdata = []
     cachepath, filename = jsoncache(uripath)
     if isinstance(cachepath, list) and cachepath:
diff --git a/jdata/jpath.py b/jdata/jpath.py
new file mode 100644
index 0000000..468764b
--- /dev/null
+++ b/jdata/jpath.py
@@ -0,0 +1,165 @@
+"""@package docstring
+JSONPath implementation ported from the jsonpath MATLAB function in JSONLab
+
+Copyright (c) 2019-2024 Qianqian Fang <q.fang at neu.edu>
+"""
+
+__all__ = [
+    "jsonpath",
+]
+
+##====================================================================================
+## dependent libraries
+##====================================================================================
+
+
+import re
+import json
+import copy
+
+
+def jsonpath(root, jpath, opt={}):
+    """@brief Query a decoded JSON/JData structure with a JSONPath expression
+
+    @param[in] root: the data (dict, list, ...) to be queried
+    @param[in] jpath: a JSONPath string such as "$.key1.key2[0]" or "$..key"
+    @param[in] opt: options forwarded to getonelevel, which reads opt['inplace']
+    @return the matched element(s), or None if any path segment can not be found
+    """
+    jpath = re.sub(r"([^.\]])(\[[-0-9:\*]+\])", r"\1.\2", jpath)
+    jpath = re.sub(r"\[[\'\"]*([^]\'\"]+)[\'\"]*\]", r".[\1]", jpath)
+    jpath = re.sub(r"\\\.", "_0x2E_", jpath)  # an escaped dot "\." is not a separator
+    keydot = re.compile(r"(\[[\'\"]*[^]\'\"]+)\.(?=[^]\'\"]+[\'\"]*\])")
+    while keydot.search(jpath):  # neither is a dot inside a [...] key
+        jpath = keydot.sub(r"\1_0x2E_", jpath)
+    paths = [x.replace("_0x2E_", ".") for x in re.findall(r"(\.{0,2}[^.]+)", jpath)]
+    paths = paths[1:] if paths[:1] == ["$"] else paths  # drop the leading root marker
+    for i in range(len(paths)):
+        root, isfound = getonelevel(root, paths, i, opt)
+        if not isfound:
+            return None
+    return root
+
+
+def getonelevel(input_data, paths, pathid, opt):
+    """@brief Resolve a single JSONPath segment, paths[pathid], against input_data
+
+    Handled segment forms: "$" (the data itself), "$N", "[N]", "[start:end]", "[*]"
+    and key names; array indices are zero-based and the end of a range is inclusive
+
+    @param[in] input_data: the data (dict, list or scalar) the segment is applied to
+    @param[in] paths: list of path segments; a segment may be a 1-element list
+    @param[in] pathid: index of the segment in paths to be resolved
+    @param[in] opt: options; if opt['inplace'] is True the match is deep-copied
+    @return (obj, isfound): the match ([] when nothing is found) and a found flag;
+        raises ValueError if a non-recursive segment hits data that is no dict/array
+    """
+    # NOTE(review): deep-copying when 'inplace' is True reads inverted relative to
+    # the option name; the behavior is kept unchanged here - confirm the intent
+    inplace = opt.get("inplace", False)  # read only, the caller's dict is untouched
+
+    pathname = paths[pathid]
+    if isinstance(pathname, list):
+        pathname = pathname[0]
+    deepscan = pathname.startswith("..")
+    origpath = pathname
+    pathname = re.sub(r"^\.+", "", pathname)
+    obj = None
+
+    if pathname == "$":
+        obj = input_data
+    elif re.match(r"^\$\d+$", pathname):
+        # "$N" selects the zero-based N-th element; the digits follow the "$"
+        obj = input_data[int(pathname[1:])]
+    elif re.match(r"^\[[\-0-9\*:]+\]$", pathname) or isinstance(
+        input_data, (list, tuple, frozenset)
+    ):
+        arraystr = pathname[1:-1]
+        # 1-based inclusive bounds of the selection; None selects the whole array
+        first = last = None
+
+        if ":" in arraystr:
+            match = re.search(r"(?P<start>-*\d*):(?P<end>-*\d*)", arraystr)
+            if match:
+                first = int(match.group("start")) + 1 if match.group("start") else 1
+                last = (
+                    int(match.group("end")) + 1
+                    if match.group("end")
+                    else len(input_data)
+                )
+                # a negative bound counts back from the end (-1 is the last item),
+                # with the same offset as the single-index case below
+                if first <= 0:
+                    first += len(input_data)
+                if last <= 0:
+                    last += len(input_data)
+        elif re.match(r"^[-0-9:]+$", arraystr):
+            first = int(arraystr) + 1
+            if first <= 0:
+                first += len(input_data)
+            last = first
+
+        # "[*]" and key names leave both bounds unset
+        if first is not None and last is not None:
+            obj = input_data[first - 1 : last]
+        else:
+            first, last = 1, len(input_data)
+
+        if not obj and isinstance(input_data, list):
+            # nothing selected by index: apply the segment to each list element
+            searchkey = ".." + pathname if deepscan else origpath
+            newobj = []
+            for item in input_data[first - 1 : last]:
+                val, isfound = getonelevel(
+                    item, paths[:pathid] + [searchkey], pathid, opt
+                )
+                if not isfound:
+                    continue
+                # a scalar match (e.g. a str) must be appended, not iterated over
+                if isinstance(val, list):
+                    newobj.extend(val)
+                else:
+                    newobj.append(val)
+            if newobj:
+                obj = newobj
+            # a single match is returned bare instead of as a 1-element list
+            if isinstance(obj, list) and len(obj) == 1:
+                obj = obj[0]
+
+    elif isinstance(input_data, dict):
+        # strip an optional [...] wrapper to get the bare key name
+        pathname = re.sub(r"^\[(.*)\]$", r"\1", pathname)
+
+        if pathname in input_data:
+            obj = [input_data[pathname]]
+        else:
+            # key absent at this level: look for it inside every child value
+            for child in input_data.values():
+                val, isfound = getonelevel(
+                    child, paths[:pathid] + [[".." + pathname]], pathid, opt
+                )
+                if not isfound:
+                    continue
+                obj = obj or []
+                if isinstance(val, list):
+                    obj.extend(val)
+                else:
+                    obj.append(val)
+
+            if obj and len(obj) == 1:
+                obj = obj[0]
+
+        # a single match is returned bare instead of as a 1-element list
+        if isinstance(obj, list) and len(obj) == 1:
+            obj = obj[0]
+
+    elif not deepscan:
+        raise ValueError(
+            f'json path segment "{pathname}" can not be found in the input_data object'
+        )
+
+    # an unmatched segment is reported as ([], False)
+    if obj is None:
+        return [], False
+
+    return (copy.deepcopy(obj), True) if inplace else (obj, True)