diff --git a/README.md b/README.md index 2530c81..c8c48f0 100644 --- a/README.md +++ b/README.md @@ -26,17 +26,25 @@ json_bytes = b''' ''' parser = cysimdjson.JSONParser() -json_parsed = parser.parse(json_bytes) +json_element = parser.parse(json_bytes) -# Standard Python access -print(json_parsed['foo'][2][0]) - -# Access using JSON Pointer (faster) -print(json_parsed.at_pointer("/foo/2/0")) +# Access using JSON Pointer +print(json_element.at_pointer("/foo/2/0")) ``` _Note: `parser` object can be reused for maximum performance._ + +### Pythonic drop-in API + +```python +parser = cysimdjson.JSONParser() +json_parsed = parser.loads(json_bytes) + +# Access using JSON Pointer +print(json_parsed.json_parsed['foo']) +``` + The `json_parsed` is a read-only dictionary-like object, that provides an access to JSON data. diff --git a/cysimdjson/cysimdjson.pyx b/cysimdjson/cysimdjson.pyx index ed2682d..211db18 100644 --- a/cysimdjson/cysimdjson.pyx +++ b/cysimdjson/cysimdjson.pyx @@ -90,6 +90,8 @@ cdef extern from "simdjson/simdjson.h" namespace "simdjson::dom": simdjson_array get_array() except +simdjson_error_handler simdjson_object get_object() except +simdjson_error_handler + simdjson_element at_pointer(const char*) except +simdjson_error_handler + cdef cppclass simdjson_parser "simdjson::dom::parser": @@ -125,15 +127,14 @@ cdef extern from "jsoninter.h": cdef const char * PyUnicode_AsUTF8AndSize(object, Py_ssize_t *) cdef simdjson_element extract_element(void *) + cdef size_t element_addrof(simdjson_element & value) cdef class JSONObject: - cdef simdjson_object Object - - - def __cinit__(JSONObject self): - pass + cdef: + simdjson_element Element + simdjson_object Object def __contains__(JSONObject self, key): @@ -155,7 +156,8 @@ cdef class JSONObject: sv = it.key() v = it.value() - yield string_view_to_python_string(sv), _wrap_element(v) + elem = JSONElement.from_element(v) + yield string_view_to_python_string(sv), elem.get_value() preincrement(it) @@ -165,7 +167,7 @@ cdef class JSONObject: key_raw = key.encode('utf-8') v = self.Object[key_raw] - return _wrap_element(v) + return JSONElement.from_element(v).get_value() def get(JSONObject self, str key, default=None): @@ -175,9 +177,8 @@ cdef class JSONObject: cdef bool found = object_get(self.Object, key_raw, v) if not found: return default - - return _wrap_element(v) - + return JSONElement.from_element(v).get_value() + def __len__(JSONObject self): return self.Object.size() @@ -196,7 +197,14 @@ cdef class JSONObject: def at_pointer(JSONObject self, key): key_raw = key.encode('utf-8') cdef simdjson_element v = self.Object.at_pointer(key_raw) - return _wrap_element(v) + return JSONElement.from_element(v).get_value() + + + def get_value(JSONElement self): + ''' + Get the python value + ''' + return self def export(self): @@ -207,12 +215,16 @@ cdef class JSONObject: return _export_object(self.Object) + def get_addr(JSONElement self): + return element_addrof(self.Element) + + cdef class JSONArray: - cdef simdjson_array Array + cdef: + simdjson_element Element + simdjson_array Array - def __cinit__(JSONArray self): - pass def __contains__(JSONArray self, item): # This is a full scan @@ -224,7 +236,8 @@ cdef class JSONArray: def __getitem__(JSONArray self, key: int): cdef simdjson_element v = self.Array.at(key) - return _wrap_element(v) + + return JSONElement.from_element(v).get_value() def __len__(JSONArray self): @@ -239,15 +252,22 @@ cdef class JSONArray: cdef simdjson_element element while it != it_end: - element = dereference(it) - yield _wrap_element(element) + elem = JSONElement.from_element(dereference(it)) + yield elem.get_value() preincrement(it) def at_pointer(JSONArray self, key): key_raw = key.encode('utf-8') cdef simdjson_element v = self.Array.at_pointer(key_raw) - return _wrap_element(v) + return JSONElement.from_element(v).get_value() + + + def get_value(JSONElement self): + ''' + Get the python value + ''' + return self def export(self): @@ -258,6 +278,121 @@ cdef class JSONArray: return _export_array(self.Array) + def get_addr(JSONElement self): + return element_addrof(self.Element) + + +cdef class JSONElement: + + cdef: + simdjson_element Element + + @staticmethod + cdef inline from_element(simdjson_element element): + ''' + This is the correct factory method + ''' + cdef simdjson_element_type et = element.type() + + if et == OBJECT: + new_object = JSONObject() + new_object.Element = element + new_object.Object = element.get_object() + return new_object + + elif et == ARRAY: + new_array = JSONArray() + new_array.Element = element + new_array.Array = element.get_array() + return new_array + + else: + new_element = JSONElement() + new_element.Element = element + return new_element + + + + def at_pointer(JSONElement self, key: str): + key_raw = key.encode('utf-8') + cdef simdjson_element v = self.Element.at_pointer(key_raw) + return JSONElement.from_element(v) + + + def get_value(JSONElement self): + return _get_element(self.Element) + + + def export(JSONElement self): + return _export_element(self.Element) + + + def get_addr(JSONElement self): + return element_addrof(self.Element) + + +cdef inline object _export_element(simdjson_element v): + cdef simdjson_element_type et = v.type() + + if et == OBJECT: + return _export_object(v.get_object()) + + elif et == ARRAY: + return _export_array(v.get_array()) + + elif et == STRING: + return element_to_py_string(v) + + elif et == INT64: + return v.get_int64() + + elif et == UINT64: + return v.get_uint64() + + elif et == DOUBLE: + return v.get_double() + + elif et == NULL_VALUE: + return None + + elif et == BOOL: + return v.get_bool() + + else: + raise ValueError("Unknown element type") + + +cdef inline object _get_element(simdjson_element v): + cdef simdjson_element_type et = v.type() + + if et == STRING: + return element_to_py_string(v) + + elif et == INT64: + return v.get_int64() + + elif et == UINT64: + return v.get_uint64() + + elif et == DOUBLE: + return v.get_double() + + elif et == NULL_VALUE: + return None + + elif et == BOOL: + return v.get_bool() + + elif et == OBJECT: + return _export_object(v.get_object()) + + elif et == ARRAY: + return _export_array(v.get_array()) + + else: + raise ValueError("Unknown element type") + + cdef class JSONParser: cdef: @@ -279,7 +414,7 @@ cdef class JSONParser: raise RuntimeError("Failed to get raw data") cdef simdjson_element element = self.Parser.parse(data_ptr, pysize, 1) - return _wrap_element(element) + return JSONElement.from_element(element) def parse_in_place(JSONParser self, event: bytes): @@ -294,7 +429,7 @@ cdef class JSONParser: raise RuntimeError("Failed to get raw data") cdef simdjson_element element = self.Parser.parse(data_ptr, pysize, 0) - return _wrap_element(element) + return JSONElement.from_element(element) def parse_string(JSONParser self, event: str): @@ -303,57 +438,47 @@ cdef class JSONParser: cdef const char * data_ptr = PyUnicode_AsUTF8AndSize(event, &pysize) cdef simdjson_element element = self.Parser.parse(data_ptr, pysize, 1) - return _wrap_element(element) + return JSONElement.from_element(element) + + + def load(JSONParser self, path: str): + ''' + This is a Pythonic API, as close to `json.load()` as possible/practical. + This means that the result of the load() is not the element but final value. + ''' + path_bytes = path.encode('utf-8') + cdef simdjson_element element = self.Parser.load(path_bytes) + return JSONElement.from_element(element).get_value() - def load(JSONParser self, path): - cdef simdjson_element element = self.Parser.load(path) - return _wrap_element(element) + def loads(JSONParser self, content: str): + ''' + This is a Pythonic API, as close to `jsons.load()` as possible/practical. + This means that the result of the loads() is not the element but final value. + ''' + path_bytes = content.encode('utf-8') + cdef simdjson_element element = self.Parser.parse(path_bytes, len(path_bytes), 1) + return JSONElement.from_element(element).get_value() def active_implementation(JSONParser self): return get_active_implementation() -# This method is used by C-level callers who want to wrap `simdjson::dom::element` into a cysimdjson object instance -cdef public api object cysimdjson_wrap_element(void * element): +cdef public api object cysimdjson_addr_to_element(void * element): + ''' + Used by C-level callers who want to wrap `simdjson::dom::element` + into a cysimdjson JSONElement instance. + ''' cdef simdjson_element v = extract_element(element) - return _wrap_element(v) + return JSONElement.from_element(v) -cdef inline object _wrap_element(simdjson_element v): - cdef simdjson_element_type et = v.type() - - if et == OBJECT: - obj = JSONObject() - obj.Object = v.get_object() - return obj - - elif et == ARRAY: - arr = JSONArray() - arr.Array = v.get_array() - return arr - - elif et == STRING: - return element_to_py_string(v) - - elif et == INT64: - return v.get_int64() - - elif et == UINT64: - return v.get_uint64() - - elif et == DOUBLE: - return v.get_double() - - elif et == NULL_VALUE: - return None - - elif et == BOOL: - return v.get_bool() - - else: - raise ValueError("Unknown element type") +def addr_to_element(element_addr: int): + cdef char * e = NULL + e += element_addr + cdef simdjson_element v = extract_element(e) + return JSONElement.from_element(v) cdef inline object _export_object(simdjson_object obj): @@ -386,37 +511,6 @@ cdef inline object _export_array(simdjson_array arr): return result -cdef inline object _export_element(simdjson_element v): - cdef simdjson_element_type et = v.type() - - if et == OBJECT: - return _export_object(v.get_object()) - - elif et == ARRAY: - return _export_array(v.get_array()) - - elif et == STRING: - return element_to_py_string(v) - - elif et == INT64: - return v.get_int64() - - elif et == UINT64: - return v.get_uint64() - - elif et == DOUBLE: - return v.get_double() - - elif et == NULL_VALUE: - return None - - elif et == BOOL: - return v.get_bool() - - else: - raise ValueError("Unknown element type") - - MAXSIZE_BYTES = SIMDJSON_MAXSIZE_BYTES PADDING = SIMDJSON_PADDING diff --git a/cysimdjson/cysimdjsonc.cpp b/cysimdjson/cysimdjsonc.cpp index bca5f96..510957f 100644 --- a/cysimdjson/cysimdjsonc.cpp +++ b/cysimdjson/cysimdjsonc.cpp @@ -22,7 +22,7 @@ void cysimdjson_parser_del(void * p) { } -size_t cysimdjson_element_sizeof(void) { +const size_t cysimdjson_element_sizeof(void) { return sizeof(simdjson::dom::element); } @@ -76,6 +76,7 @@ bool cysimdjson_element_get_str(const char * attrname, size_t attrlen, void * e, } bool cysimdjson_element_get_int64_t(const char * attrname, size_t attrlen, void * e, int64_t * output) { + simdjson::dom::element * element = static_cast(e); std::string_view pointer = std::string_view(attrname, attrlen); @@ -155,10 +156,23 @@ char cysimdjson_element_get_type(const char * attrname, size_t attrlen, void * e return '\0'; } -// This is here for an unit test -void cysimdjson_parser_test() { - printf("cysimdjson_parser_test started ...\n"); +bool cysimdjson_element_get(const char * attrname, size_t attrlen, void * e, void * output_element) { + simdjson::dom::element * element = static_cast(e); + std::string_view pointer = std::string_view(attrname, attrlen); + + simdjson::dom::element * sub_element = new(output_element) simdjson::dom::element(); + + auto err = element->at_pointer(pointer).get(*sub_element); + if (err) { + return true; + } + + return false; + +} +// This is here for an unit test +int cysimdjson_parser_test() { simdjson::dom::parser parser; simdjson::dom::object object; @@ -166,6 +180,9 @@ void cysimdjson_parser_test() { const size_t jsond_len = std::strlen(jsond); auto error = parser.parse(jsond, jsond_len).get(object); - - printf("cysimdjson_parser_test OK!\n"); + if (error) { + return -1; + } + + return 0; } diff --git a/cysimdjson/cysimdjsonc.h b/cysimdjson/cysimdjsonc.h index 489caf9..1597c57 100644 --- a/cysimdjson/cysimdjsonc.h +++ b/cysimdjson/cysimdjsonc.h @@ -11,17 +11,20 @@ void * cysimdjson_parser_new(void); void cysimdjson_parser_del(void * parser); -size_t cysimdjson_element_sizeof(void); -bool cysimdjson_parser_parse(void * parser, void * memory, const uint8_t * data, size_t datalen); +const size_t cysimdjson_element_sizeof(void); + +// `element` is a pointer with pre-allocated buffer of the size=cysimdjson_element_sizeof() +bool cysimdjson_parser_parse(void * parser, void * element, const uint8_t * data, size_t datalen); bool cysimdjson_element_get_str(const char * attrname, size_t attrlen, void * element, char ** output, size_t * outputlen); -bool cysimdjson_element_get_int64_t(const char * attrname, size_t attrlen, void * e, int64_t * output); -bool cysimdjson_element_get_uint64_t(const char * attrname, size_t attrlen, void * e, uint64_t * output); -bool cysimdjson_element_get_bool(const char * attrname, size_t attrlen, void * e, bool * output); -bool cysimdjson_element_get_double(const char * attrname, size_t attrlen, void * e, double * output); +bool cysimdjson_element_get_int64_t(const char * attrname, size_t attrlen, void * element, int64_t * output); +bool cysimdjson_element_get_uint64_t(const char * attrname, size_t attrlen, void * element, uint64_t * output); +bool cysimdjson_element_get_bool(const char * attrname, size_t attrlen, void * element, bool * output); +bool cysimdjson_element_get_double(const char * attrname, size_t attrlen, void * element, double * output); -char cysimdjson_element_get_type(const char * attrname, size_t attrlen, void * e); +char cysimdjson_element_get_type(const char * attrname, size_t attrlen, void * element); +bool cysimdjson_element_get(const char * attrname, size_t attrlen, void * element, void * output_element); -void cysimdjson_parser_test(void); +int cysimdjson_parser_test(void); #endif diff --git a/cysimdjson/jsoninter.h b/cysimdjson/jsoninter.h index 3e773b7..3c36ebd 100644 --- a/cysimdjson/jsoninter.h +++ b/cysimdjson/jsoninter.h @@ -40,3 +40,7 @@ inline dom::element extract_element(void * p) { dom::element * element = static_cast(p); return *element; } + +inline size_t element_addrof(dom::element & element) { + return (size_t)&element; +} diff --git a/perftest/test_benchmark.py b/perftest/test_benchmark.py index 834545d..76bcbf8 100644 --- a/perftest/test_benchmark.py +++ b/perftest/test_benchmark.py @@ -3,6 +3,7 @@ jsonpath = pathlib.Path(__file__).parent / "jsonexamples" + def benchmark(name, what, number): dt = timeit.timeit(what, number=number) return (name, number, dt) @@ -30,17 +31,17 @@ def print_results(jsonfile, results): print("") -def perftest_orjson_parser(jsonfile, number): - import orjson +# def perftest_orjson_parser(jsonfile, number): +# import orjson - with open(jsonfile, 'rb') as f: - jsonb = f.read() +# with open(jsonfile, 'rb') as f: +# jsonb = f.read() - return benchmark( - "orjson loads", - lambda: orjson.loads(jsonb), - number=number - ) +# return benchmark( +# "orjson loads", +# lambda: orjson.loads(jsonb), +# number=number +# ) def perftest_pysimdjson_parser(jsonfile, number): @@ -58,17 +59,17 @@ def perftest_pysimdjson_parser(jsonfile, number): ) -def perftest_libpy_simdjson_parser(jsonfile, number): - import libpy_simdjson +# def perftest_libpy_simdjson_parser(jsonfile, number): +# import libpy_simdjson - with open(jsonfile, 'rb') as f: - jsonb = f.read() +# with open(jsonfile, 'rb') as f: +# jsonb = f.read() - return benchmark( - "libpy_simdjson loads", - lambda: libpy_simdjson.loads(jsonb), - number=number - ) +# return benchmark( +# "libpy_simdjson loads", +# lambda: libpy_simdjson.loads(jsonb), +# number=number +# ) def perftest_pythonjson_loads(jsonfile, number): @@ -113,9 +114,10 @@ def perftest_cysimdjson_pad_parse(jsonfile, number): number=number ) + def main(): test_set = [ - perftest_orjson_parser, + # perftest_orjson_parser, perftest_pysimdjson_parser, # perftest_libpy_simdjson_parser, perftest_pythonjson_loads, diff --git a/test/__init__.py b/test/__init__.py index adc6223..eaf92de 100644 --- a/test/__init__.py +++ b/test/__init__.py @@ -1,3 +1,4 @@ from .test_array import * from .test_document import * +from .test_scalar import * from .test_capi import * diff --git a/test/test_capi/__init__.py b/test/test_capi/__init__.py index 1a3e63a..bcddd34 100644 --- a/test/test_capi/__init__.py +++ b/test/test_capi/__init__.py @@ -1,8 +1,11 @@ +import os import ctypes import unittest import cysimdjson +THIS_DIR = os.path.dirname(os.path.abspath(__file__)) + class CySIMDJSONCAPITestCases(unittest.TestCase): @@ -10,21 +13,143 @@ class CySIMDJSONCAPITestCases(unittest.TestCase): def setUp(self): self.cysimdjsonapi = ctypes.cdll.LoadLibrary(cysimdjson.__file__) - def test_capi_01(self): - self.cysimdjsonapi.cysimdjson_parser_new.restype = ctypes.c_void_p + self.cysimdjsonapi.cysimdjson_parser_del.argtypes = [ctypes.c_void_p] - parser = self.cysimdjsonapi.cysimdjson_parser_new() + self.cysimdjsonapi.cysimdjson_parser_parse.restype = ctypes.c_bool + self.cysimdjsonapi.cysimdjson_parser_parse.argtypes = [ctypes.c_void_p, ctypes.c_void_p, ctypes.c_void_p, ctypes.c_size_t] + + self.cysimdjsonapi.cysimdjson_element_get_int64_t.restype = ctypes.c_bool + self.cysimdjsonapi.cysimdjson_element_get_int64_t.argtypes = [ctypes.c_void_p, ctypes.c_size_t, ctypes.c_void_p, ctypes.POINTER(ctypes.c_int64)] + + self.cysimdjsonapi.cysimdjson_element_get.restype = ctypes.c_bool + self.cysimdjsonapi.cysimdjson_element_get.argtypes = [ctypes.c_void_p, ctypes.c_size_t, ctypes.c_void_p, ctypes.c_void_p] + + def test_capi_01(self): + parser = self.cysimdjsonapi.cysimdjson_parser_new() self.cysimdjsonapi.cysimdjson_parser_del(parser) def test_capi_02(self): - self.cysimdjsonapi.cysimdjson_parser_new.restype = ctypes.c_int element_sizeof = self.cysimdjsonapi.cysimdjson_element_sizeof() - print("element_sizeof:", element_sizeof) + self.assertGreater(element_sizeof, 0) def test_capi_03(self): - self.cysimdjsonapi.cysimdjson_parser_test() + res = self.cysimdjsonapi.cysimdjson_parser_test() + self.assertEqual(res, 0) + + + def test_capi_04(self): + parser = self.cysimdjsonapi.cysimdjson_parser_new() + + element = ctypes.create_string_buffer( + self.cysimdjsonapi.cysimdjson_element_sizeof() + ) + + with open(os.path.join(THIS_DIR, 'test.json'), 'rb') as fin: + json_raw = fin.read() + json_buffer = ctypes.create_string_buffer(json_raw) + + error = self.cysimdjsonapi.cysimdjson_parser_parse( + parser, + element, + json_buffer, + len(json_raw) + ) + self.assertFalse(error) + + jsonpointer = ctypes.create_string_buffer(b"/document/key4") + int64_ptr = ctypes.c_int64() + + error = self.cysimdjsonapi.cysimdjson_element_get_int64_t( + jsonpointer, + len(jsonpointer) - 1, # We don't want terminating '\0' + element, + int64_ptr + ) + self.assertFalse(error) + self.assertEqual(int64_ptr.value, 40) + + self.cysimdjsonapi.cysimdjson_parser_del(parser) + + + def test_capi_05(self): + parser = self.cysimdjsonapi.cysimdjson_parser_new() + + element = ctypes.create_string_buffer( + self.cysimdjsonapi.cysimdjson_element_sizeof() + ) + + with open(os.path.join(THIS_DIR, 'test.json'), 'rb') as fin: + json_raw = fin.read() + json_buffer = ctypes.create_string_buffer(json_raw) + + error = self.cysimdjsonapi.cysimdjson_parser_parse( + parser, + element, + json_buffer, + len(json_raw) + ) + self.assertFalse(error) + + jsonpointer = ctypes.create_string_buffer(b"/document") + + subelement = ctypes.create_string_buffer( + self.cysimdjsonapi.cysimdjson_element_sizeof() + ) + + error = self.cysimdjsonapi.cysimdjson_element_get( + jsonpointer, + len(jsonpointer) - 1, # We don't want terminating '\0' + element, + subelement + ) + self.assertFalse(error) + + + jsonpointer = ctypes.create_string_buffer(b"/key4") + int64_ptr = ctypes.c_int64() + + error = self.cysimdjsonapi.cysimdjson_element_get_int64_t( + jsonpointer, + len(jsonpointer) - 1, # We don't want terminating '\0' + subelement, + int64_ptr + ) + self.assertFalse(error) + self.assertEqual(int64_ptr.value, 40) + + + self.cysimdjsonapi.cysimdjson_parser_del(parser) + + + def test_capi_06(self): + + parser = cysimdjson.JSONParser() + + with open(os.path.join(THIS_DIR, 'test.json'), 'r') as fo: + json_parsed = parser.parse_string(fo.read()) + + # Transition into C API + element_addr = json_parsed.get_addr() + self.assertNotEqual(element_addr, 0) + + jsonpointer = ctypes.create_string_buffer(b"/document/key4") + int64_ptr = ctypes.c_int64() + + error = self.cysimdjsonapi.cysimdjson_element_get_int64_t( + jsonpointer, + len(jsonpointer) - 1, # We don't want terminating '\0' + element_addr, + int64_ptr + ) + self.assertFalse(error) + self.assertEqual(int64_ptr.value, 40) + + # Transition back to Cython API + cython_element = cysimdjson.addr_to_element(element_addr) + val = cython_element.at_pointer("/document/key4") + self.assertEqual(val, 40) diff --git a/test/test_capi/test.json b/test/test_capi/test.json new file mode 100644 index 0000000..b27864b --- /dev/null +++ b/test/test_capi/test.json @@ -0,0 +1,9 @@ +{ + "document": { + "key1": 1, + "key2": "2", + "key3": "3", + "key4": 40, + "key5": "50" + } +} diff --git a/test/test_document/__init__.py b/test/test_document/__init__.py index 1d426a7..4bc9b88 100644 --- a/test/test_document/__init__.py +++ b/test/test_document/__init__.py @@ -91,7 +91,7 @@ def test_parser_resut(self): ) - def test_gey_01(self): + def test_get_01(self): parser = cysimdjson.JSONParser() @@ -115,3 +115,13 @@ def test_gey_01(self): v3 = json_parsed.get('not-present') self.assertEqual(v3, None) + + + def test_loads_01(self): + + parser = cysimdjson.JSONParser() + json_parsed = parser.loads('''{"foo":"bar"}''') + + self.assertEqual(json_parsed['foo'], 'bar') + + diff --git a/test/test_scalar/__init__.py b/test/test_scalar/__init__.py new file mode 100644 index 0000000..aa8741e --- /dev/null +++ b/test/test_scalar/__init__.py @@ -0,0 +1,26 @@ +import unittest +import os + +import cysimdjson + +THIS_DIR = os.path.dirname(os.path.abspath(__file__)) + + +class JSONScalarTestCases(unittest.TestCase): + + def test_scalar_01(self): + + parser = cysimdjson.JSONParser() + + with open(os.path.join(THIS_DIR, 'scalar_01.json'), 'rb') as fo: + json_parsed = parser.parse(fo.read()) + + self.assertEqual(json_parsed.get_value(), 1) + + + def test_scalar_02(self): + + parser = cysimdjson.JSONParser() + + json_loaded = parser.load(os.path.join(THIS_DIR, 'scalar_01.json')) + self.assertEqual(json_loaded, 1) diff --git a/test/test_scalar/scalar_01.json b/test/test_scalar/scalar_01.json new file mode 100644 index 0000000..d00491f --- /dev/null +++ b/test/test_scalar/scalar_01.json @@ -0,0 +1 @@ +1