From 2647fc9fdbbbef9587a3b969f506d9fe75cd0835 Mon Sep 17 00:00:00 2001 From: Qianqian Fang Date: Sun, 31 Mar 2024 15:36:11 -0400 Subject: [PATCH] [feat] port jsonpath from JSONLab, add loadurl for REST API --- jdata/__init__.py | 4 ++ jdata/jfile.py | 39 +++++++++-- jdata/jpath.py | 165 ++++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 203 insertions(+), 5 deletions(-) create mode 100644 jdata/jpath.py diff --git a/jdata/__init__.py b/jdata/__init__.py index d39ab89..563ecc6 100644 --- a/jdata/__init__.py +++ b/jdata/__init__.py @@ -36,6 +36,7 @@ from .jfile import ( load, save, + loadurl, show, loadt, savet, @@ -48,11 +49,13 @@ jext, ) from .jdata import encode, decode, jdtype, jsonfilter +from .jpath import jsonpath __version__ = "0.5.5" __all__ = [ "load", "save", + "loadurl", "show", "loadt", "savet", @@ -67,6 +70,7 @@ "jdtype", "jsonfilter", "jext", + "jsonpath", ] __license__ = """Apache license 2.0, Copyright (c) 2019-2024 Qianqian Fang""" diff --git a/jdata/jfile.py b/jdata/jfile.py index 2cb94b1..1f8d10c 100644 --- a/jdata/jfile.py +++ b/jdata/jfile.py @@ -7,6 +7,7 @@ __all__ = [ "load", "save", + "loadurl", "show", "loadt", "savet", @@ -54,7 +55,7 @@ def load(fname, opt={}, **kwargs): """ if re.match("^https*://", fname): newdata = downloadlink(fname, opt, **kwargs) - return newdata + return newdata[0] spl = os.path.splitext(fname) ext = spl[1].lower() @@ -102,6 +103,24 @@ def save(data, fname, opt={}, **kwargs): ) +def loadurl(url, opt={}, **kwargs): + """@brief Loading a JData file (binary or text) from a URL without caching locally + + @param[in] url: a REST API URL, currently only supports http:// and https:// + @param[in] opt: options, opt['nocache']=True by default, setting it to False downloads and locally caches the data + """ + opt.setdefault("nocache", True) + + if re.match("^https*://", url): + newdata = downloadlink(url, opt, **kwargs) + return newdata[0] + else: + raise Exception( + "JData", + "input to loadurl is not a valid URL", + ) 
+ + ##==================================================================================== ## Loading and saving text-based JData (i.e. JSON) files ##==================================================================================== @@ -370,7 +389,6 @@ def jsoncache(url, opt={}, **kwargs): if p is not None: cachepath.insert(0, p) elif dbname and docname: - print([domain, dbname, docname, cachepath]) cachepath = [os.path.join(x, domain, dbname, docname) for x in cachepath] if filename is not None: for i in range(len(cachepath)): @@ -421,7 +439,7 @@ def jdlink(uripath, opt={}, **kwargs): ) alloutput = [[] for _ in range(3)] for i in range(len(uripath)): - newdata, fname, cachepath = downloadlink(uripath[i], opt) + newdata, fname, cachepath = downloadlink(uripath[i], opt, **kwargs) alloutput[0].append(newdata) alloutput[1].append(fname) alloutput[2].append(cachepath) @@ -429,13 +447,24 @@ def jdlink(uripath, opt={}, **kwargs): alloutput = [x[0] for x in alloutput] newdata, fname, cachepath = tuple(alloutput) elif isinstance(uripath, str): - newdata, fname, cachepath = downloadlink(uripath, opt) + newdata, fname, cachepath = downloadlink(uripath, opt, **kwargs) return newdata, fname -def downloadlink(uripath, opt={}): +def downloadlink(uripath, opt={}, **kwargs): opt.setdefault("showlink", 1) + if "nocache" in opt and opt["nocache"]: + newdata = urllib.request.urlopen(uripath).read() + try: + newdata = loadts(newdata, opt, **kwargs) + except: + try: + newdata = loadbs(newdata, opt, **kwargs) + except: + pass + return newdata, uripath, None + newdata = [] cachepath, filename = jsoncache(uripath) if isinstance(cachepath, list) and cachepath: diff --git a/jdata/jpath.py b/jdata/jpath.py new file mode 100644 index 0000000..468764b --- /dev/null +++ b/jdata/jpath.py @@ -0,0 +1,165 @@ +"""@package docstring +JSONPath implementation ported from the jsonpath MATLAB function in JSONLab + +Copyright (c) 2019-2024 Qianqian Fang +""" + +__all__ = [ + "jsonpath", +] + 
##====================================================================================
## dependent libraries
##====================================================================================


import re
import json
import copy


def jsonpath(root, jpath, opt=None):
    """@brief Query a decoded JSON object (nested dict/list) using a JSONPath-like string

    @param[in] root: the data to query, typically nested dict/list objects
    @param[in] jpath: the path string, e.g. "$.key.subkey", "$.key[0]", "$.key[1:2]",
                  "$..key" (search in descendants), "$['key.with.dot']" or "$.key\\.with\\.dot"
    @param[in] opt: optional dict; if opt['inplace'] is true, every level returns a deep copy
                  of the matched value instead of a reference into root. The dict is not modified.
    @return the matched value, a list of values when several items match, or None when
                  any segment of the path can not be found

    A ValueError is raised (by getonelevel) when a key segment is applied to a value that
    is neither a dict nor a list/tuple/frozenset.
    Note that a slice such as [1:2] includes both ends, following the JSONLab convention.
    """
    opt = {} if opt is None else opt

    obj = root
    # "key[0]" -> "key.[0]" so that every subscript becomes its own path segment
    jpath = re.sub(r"([^.\]])(\[[-0-9:\*]+\])", r"\1.\2", jpath)
    # "['key']" / "[key]" -> ".[key]": drop the quotes and prefix the segment with a dot
    jpath = re.sub(r"\[[\'\"]*([^]\'\"]+)[\'\"]*\]", r".[\1]", jpath)
    # protect escaped dots ("\.") so they are not treated as segment separators
    jpath = re.sub(r"\\\.", "_0x2E_", jpath)
    # protect dots that appear inside brackets, one at a time until none is left
    bracket_dot = r"(\[[\'\"]*[^]\'\"]+)\.(?=[^]\'\"]+[\'\"]*\])"
    while re.search(bracket_dot, jpath):
        jpath = re.sub(bracket_dot, r"\1_0x2E_", jpath)

    # split into segments, each keeping its leading "." or ".." and restore protected dots
    paths = [seg.replace("_0x2E_", ".") for seg in re.findall(r"(\.{0,2}[^.]+)", jpath)]
    if paths and paths[0] == "$":
        paths.pop(0)

    for i, _ in enumerate(paths):
        obj, isfound = getonelevel(obj, paths, i, opt)
        if not isfound:
            return None
    return obj


def _array_bounds(arraystr, input_data):
    """Translate the text inside "[...]" into 1-based inclusive (start, end) bounds.

    Returns None when arraystr is neither a slice ("a:b") nor a single integer, e.g. "*"
    or the residue of a key name. len(input_data) is only evaluated when it is needed.
    """
    if ":" in arraystr:
        match = re.search(r"(?P<start>-*\d*):(?P<end>-*\d*)", arraystr)
        if not match:
            return None
        start = int(match.group("start")) if match.group("start") else None
        end = int(match.group("end")) if match.group("end") else None

        # NOTE(review): unlike the single-index case below, negative slice bounds are
        # not shifted by +1 here - confirm this is the intended convention.
        if start is None:
            start = 1
        elif start < 0:
            start = len(input_data) + start
        else:
            start += 1

        if end is None:
            end = len(input_data)
        elif end < 0:
            end = len(input_data) + end
        else:
            end += 1
        return start, end

    if re.match(r"^[-0-9:]+$", arraystr):
        idx = int(arraystr)
        idx = len(input_data) + idx + 1 if idx < 0 else idx + 1
        return idx, idx

    return None


def getonelevel(input_data, paths, pathid, opt):
    """@brief Resolve one segment of a parsed JSONPath against the current value

    @param[in] input_data: the value reached so far
    @param[in] paths: list of path segments; an item may be wrapped in a one-element list
    @param[in] pathid: index in paths of the segment to be resolved
    @param[in] opt: option dict; only opt['inplace'] is read (see jsonpath)
    @return a tuple (obj, isfound); obj is [] when isfound is False

    Raises ValueError when the segment is a key, input_data is neither a dict nor a
    list/tuple/frozenset, and the segment does not start with "..".
    """
    inplace = bool(opt.get("inplace", False)) if opt else False

    pathname = paths[pathid]
    if isinstance(pathname, list):
        pathname = pathname[0]
    deepscan = pathname.startswith("..")
    origpath = pathname
    pathname = pathname.lstrip(".")
    obj = None

    if pathname == "$":
        obj = input_data
    elif re.match(r"\$\d+", pathname):
        # "$N": zero-based index into the root-level sequence
        obj = input_data[int(pathname[1:])]
    elif re.match(r"^\[[\-0-9\*:]+\]$", pathname) or isinstance(
        input_data, (list, tuple, frozenset)
    ):
        bounds = _array_bounds(pathname[1:-1], input_data)
        if bounds is None:
            # no explicit subscript: consider every element
            bounds = (1, len(input_data))
        else:
            obj = input_data[bounds[0] - 1 : bounds[1]]

        if not obj and isinstance(input_data, list):
            # nothing selected directly: apply the segment to each element instead
            searchkey = ".." + pathname if deepscan else origpath
            newobj = []
            for item in input_data[bounds[0] - 1 : bounds[1]]:
                val, isfound = getonelevel(
                    item, paths[:pathid] + [searchkey], pathid, opt
                )
                if isfound:
                    # extend() only for lists; scalars, strings and dicts are appended whole
                    if isinstance(val, list):
                        newobj.extend(val)
                    else:
                        newobj.append(val)
            if newobj:
                obj = newobj
        if isinstance(obj, list) and len(obj) == 1:
            obj = obj[0]

    elif isinstance(input_data, dict):
        pathname = re.sub(r"^\[(.*)\]$", r"\1", pathname)

        if pathname in input_data:
            obj = [input_data[pathname]]

        if obj is None:
            # key is missing at this level: look for it in every child value
            for child in input_data.values():
                val, isfound = getonelevel(
                    child, paths[:pathid] + [[".." + pathname]], pathid, opt
                )
                if isfound:
                    obj = obj or []
                    if isinstance(val, list):
                        obj.extend(val)
                    else:
                        obj.append(val)

            if obj and len(obj) == 1:
                obj = obj[0]

        if isinstance(obj, list) and len(obj) == 1:
            obj = obj[0]

    elif not deepscan:
        raise ValueError(
            f'json path segment "{pathname}" can not be found in the input_data object'
        )

    isfound = obj is not None
    if not isfound:
        obj = []

    # NOTE(review): a deep copy is returned when 'inplace' is true; the option name
    # may suggest the opposite - confirm against the JSONLab documentation.
    return (copy.deepcopy(obj), isfound) if inplace else (obj, isfound)