diff --git a/autotest/generate_parquet_test_file.py b/autotest/generate_parquet_test_file.py
index 84beadc8e595..6f8543aabe22 100644
--- a/autotest/generate_parquet_test_file.py
+++ b/autotest/generate_parquet_test_file.py
@@ -615,7 +615,129 @@ def generate_parquet_wkt_with_dict():
     )
 
 
+def generate_nested_types():
+    """Write ``ogr/data/parquet/nested_types.parquet``.
+
+    The table has five rows and one column per nested Arrow type:
+
+    * ``map_list_<T>``: map<string, list<T>>
+    * ``map_map_<T>``: map<string, map<string, T>>
+    * ``list_list_string``: list<list<string>>
+    * ``list_map_string``: list<map<string, string>>
+
+    where ``<T>`` is ``bool`` or one of ``numeric_type_names`` (plus
+    ``string`` for ``map_map_``). It is written uncompressed, in row groups
+    of three rows.
+    """
+    import pathlib
+
+    import pyarrow as pa
+    import pyarrow.parquet as pq
+
+    # Names of the pyarrow type factories (pa.uint8(), pa.int8(), ...) whose
+    # columns share the same sample values, in output column order.
+    numeric_type_names = (
+        "uint8",
+        "int8",
+        "uint16",
+        "int16",
+        "uint32",
+        "int32",
+        "uint64",
+        "int64",
+        "float32",
+        "float64",
+    )
+
+    def string_map(item_type):
+        """Return the Arrow type map<string, item_type>."""
+        return pa.map_(pa.string(), item_type)
+
+    # Insertion order of this dict is the column order of the output file.
+    columns = {}
+
+    columns["map_list_bool"] = pa.array(
+        [
+            [("x", [True]), ("y", [False, True])],
+            [("z", [])],
+            None,
+            [("w", [True, False])],
+            [],
+        ],
+        type=string_map(pa.list_(pa.bool_())),
+    )
+
+    map_list_numbers = [
+        [("x", [2]), ("y", [3, 4])],
+        [("z", [])],
+        None,
+        [("w", [5, 6])],
+        [],
+    ]
+    for type_name in numeric_type_names:
+        columns["map_list_" + type_name] = pa.array(
+            map_list_numbers,
+            type=string_map(pa.list_(getattr(pa, type_name)())),
+        )
+
+    columns["map_map_bool"] = pa.array(
+        [
+            [("a", [("b", True), ("c", None), ("d", None)]), ("e", None)],
+            None,
+            [("f", [("g", False)])],
+            None,
+            None,
+        ],
+        type=string_map(string_map(pa.bool_())),
+    )
+
+    map_map_numbers = [
+        [("a", [("b", 1), ("c", None), ("d", 2)]), ("e", None)],
+        None,
+        [("f", [("g", 3)])],
+        None,
+        None,
+    ]
+    for type_name in numeric_type_names:
+        columns["map_map_" + type_name] = pa.array(
+            map_map_numbers,
+            type=string_map(string_map(getattr(pa, type_name)())),
+        )
+
+    columns["map_map_string"] = pa.array(
+        [
+            [("a", [("b", "c"), ("d", None)]), ("e", None)],
+            None,
+            [("f", [("g", "h")])],
+            None,
+            None,
+        ],
+        type=string_map(string_map(pa.string())),
+    )
+
+    columns["list_list_string"] = pa.array(
+        [[["a"], None, ["b", None, "cd"]], None, [["efg"]], [], []],
+        type=pa.list_(pa.list_(pa.string())),
+    )
+
+    columns["list_map_string"] = pa.array(
+        [[[("a", "b"), ("c", "d")], [("e", "f")]], None, [], [], []],
+        type=pa.list_(string_map(pa.string())),
+    )
+
+    table = pa.table(list(columns.values()), names=list(columns))
+
+    HERE = pathlib.Path(__file__).parent
+    pq.write_table(
+        table,
+        HERE / "ogr/data/parquet/nested_types.parquet",
+        compression="NONE",
+        row_group_size=3,
+    )
+
+
 if __name__ == "__main__":
     generate_test_parquet()
     generate_all_geoms_parquet()
     generate_parquet_wkt_with_dict()
+    generate_nested_types()
diff --git
a/autotest/ogr/data/parquet/nested_types.parquet b/autotest/ogr/data/parquet/nested_types.parquet
new file mode 100644
index 000000000000..e178d2ec7fe4
Binary files /dev/null and b/autotest/ogr/data/parquet/nested_types.parquet differ
diff --git a/autotest/ogr/ogr_parquet.py b/autotest/ogr/ogr_parquet.py
index 747bff643c14..990a5d9be5d5 100755
--- a/autotest/ogr/ogr_parquet.py
+++ b/autotest/ogr/ogr_parquet.py
@@ -2374,3 +2374,127 @@ def test_ogr_parquet_read_dataset_with_empty_batch():
     # Check that we don't iterate forever
     lyr.GetExtent()
     assert len([f for f in lyr]) == 1
+
+
+###############################################################################
+# Read nested types that we map to JSON
+
+
+def test_ogr_parquet_nested_types():
+    """Check the JSON text reported for each nested-type field.
+
+    Reads the first five features of data/parquet/nested_types.parquet and,
+    for every field, asserts that the value equals the expected compact JSON
+    string, or is None where no value is expected.
+    """
+
+    int_types = (
+        "uint8",
+        "int8",
+        "uint16",
+        "int16",
+        "uint32",
+        "int32",
+        "uint64",
+        "int64",
+    )
+    float_types = ("float32", "float64")
+
+    # Expected values shared by several columns: one entry per feature, in
+    # reading order.
+    map_list_of_ints = (
+        '{"x":[2],"y":[3,4]}',
+        '{"z":[]}',
+        None,
+        '{"w":[5,6]}',
+        "{}",
+    )
+    map_list_of_floats = (
+        '{"x":[2.0],"y":[3.0,4.0]}',
+        '{"z":[]}',
+        None,
+        '{"w":[5.0,6.0]}',
+        "{}",
+    )
+    map_map_of_ints = (
+        '{"a":{"b":1,"c":null,"d":2},"e":null}',
+        None,
+        '{"f":{"g":3}}',
+        None,
+        None,
+    )
+    map_map_of_floats = (
+        '{"a":{"b":1.0,"c":null,"d":2.0},"e":null}',
+        None,
+        '{"f":{"g":3.0}}',
+        None,
+        None,
+    )
+
+    # Field name -> expected value for each of the five features.
+    expected = {
+        "map_list_bool": (
+            '{"x":[true],"y":[false,true]}',
+            '{"z":[]}',
+            None,
+            '{"w":[true,false]}',
+            "{}",
+        ),
+        "map_map_bool": (
+            '{"a":{"b":true,"c":null,"d":null},"e":null}',
+            None,
+            '{"f":{"g":false}}',
+            None,
+            None,
+        ),
+        "map_map_string": (
+            '{"a":{"b":"c","d":null},"e":null}',
+            None,
+            '{"f":{"g":"h"}}',
+            None,
+            None,
+        ),
+        "list_list_string": (
+            '[["a"],null,["b",null,"cd"]]',
+            None,
+            '[["efg"]]',
+            "[]",
+            "[]",
+        ),
+        "list_map_string": (
+            '[{"a":"b","c":"d"},{"e":"f"}]',
+            None,
+            "[]",
+            "[]",
+            "[]",
+        ),
+    }
+    for type_name in int_types:
+        expected["map_list_" + type_name] = map_list_of_ints
+        expected["map_map_" + type_name] = map_map_of_ints
+    for type_name in float_types:
+        expected["map_list_" + type_name] = map_list_of_floats
+        expected["map_map_" + type_name] = map_map_of_floats
+    # Unlike the other integer types, the expected JSON for map_map_uint64
+    # spells its values like the floating point columns (1.0, 2.0, 3.0).
+    expected["map_map_uint64"] = map_map_of_floats
+
+    # Check fields in column order: map_list_*, map_map_*, then the lists.
+    scalar_types = ("bool",) + int_types + float_types
+    field_names = (
+        ["map_list_" + type_name for type_name in scalar_types]
+        + ["map_map_" + type_name for type_name in scalar_types]
+        + ["map_map_string", "list_list_string", "list_map_string"]
+    )
+
+    # File generated by autotest/generate_parquet_test_file.py::generate_nested_types()
+    ds = ogr.Open("data/parquet/nested_types.parquet")
+    lyr = ds.GetLayer(0)
+    for feature_idx in range(5):
+        f = lyr.GetNextFeature()
+        for field_name in field_names:
+            expected_value = expected[field_name][feature_idx]
+            if expected_value is None:
+                assert f[field_name] is None
+            else:
+                assert f[field_name] == expected_value
diff --git a/ogr/ogrsf_frmts/arrow_common/ogr_arrow.h b/ogr/ogrsf_frmts/arrow_common/ogr_arrow.h
index 3288f9e1e5b0..1d5e1a120285 100644
--- a/ogr/ogrsf_frmts/arrow_common/ogr_arrow.h
+++ b/ogr/ogrsf_frmts/arrow_common/ogr_arrow.h
@@ -139,6 +139,12 @@ class OGRArrowLayer CPL_NON_FINAL
     virtual std::string GetDriverUCName() const = 0;
     static bool IsIntegerArrowType(arrow::Type::type typeId);
     static bool
+    IsHandledListOrMapType(const std::shared_ptr &valueType);
+    static bool
+    IsHandledListType(const std::shared_ptr &listType);
+    static bool
+    IsHandledMapType(const std::shared_ptr &mapType);
+    static bool
     IsValidGeometryEncoding(const std::shared_ptr &field,
                             const std::string &osEncoding,
                             OGRwkbGeometryType &eGeomTypeOut,
diff --git a/ogr/ogrsf_frmts/arrow_common/ograrrowlayer.hpp b/ogr/ogrsf_frmts/arrow_common/ograrrowlayer.hpp
index 794d9808af6a..3aa266ab7957 100644
--- a/ogr/ogrsf_frmts/arrow_common/ograrrowlayer.hpp
+++
b/ogr/ogrsf_frmts/arrow_common/ograrrowlayer.hpp @@ -170,6 +170,48 @@ inline bool OGRArrowLayer::IsIntegerArrowType(arrow::Type::type typeId) typeId == arrow::Type::INT64 || typeId == arrow::Type::UINT64; } +/************************************************************************/ +/* IsHandledListOrMapType() */ +/************************************************************************/ + +inline bool OGRArrowLayer::IsHandledListOrMapType( + const std::shared_ptr &valueType) +{ + const auto itemTypeId = valueType->id(); + return itemTypeId == arrow::Type::BOOL || IsIntegerArrowType(itemTypeId) || + itemTypeId == arrow::Type::HALF_FLOAT || + itemTypeId == arrow::Type::FLOAT || + itemTypeId == arrow::Type::DOUBLE || + itemTypeId == arrow::Type::STRING || + (itemTypeId == arrow::Type::MAP && + IsHandledMapType( + std::static_pointer_cast(valueType))) || + (itemTypeId == arrow::Type::LIST && + IsHandledListType( + std::static_pointer_cast(valueType))); +} + +/************************************************************************/ +/* IsHandledListType() */ +/************************************************************************/ + +inline bool OGRArrowLayer::IsHandledListType( + const std::shared_ptr &listType) +{ + return IsHandledListOrMapType(listType->value_type()); +} + +/************************************************************************/ +/* IsHandledMapType() */ +/************************************************************************/ + +inline bool +OGRArrowLayer::IsHandledMapType(const std::shared_ptr &mapType) +{ + return mapType->key_type()->id() == arrow::Type::STRING && + IsHandledListOrMapType(mapType->item_type()); +} + /************************************************************************/ /* MapArrowTypeToOGR() */ /************************************************************************/ @@ -304,11 +346,22 @@ inline bool OGRArrowLayer::MapArrowTypeToOGR( eType = OFTStringList; break; default: - bTypeOK = false; - CPLError(CE_Warning, 
CPLE_AppDefined, - "Field %s of unhandled type %s ignored", - field->name().c_str(), type->ToString().c_str()); + { + if (IsHandledListType(listType)) + { + eType = OFTString; + eSubType = OFSTJSON; + } + else + { + bTypeOK = false; + CPLError(CE_Warning, CPLE_AppDefined, + "Field %s of unhandled type %s ignored", + field->name().c_str(), + type->ToString().c_str()); + } break; + } } break; } @@ -316,14 +369,7 @@ inline bool OGRArrowLayer::MapArrowTypeToOGR( case arrow::Type::MAP: { auto mapType = std::static_pointer_cast(type); - const auto itemTypeId = mapType->item_type()->id(); - if (mapType->key_type()->id() == arrow::Type::STRING && - (itemTypeId == arrow::Type::BOOL || - IsIntegerArrowType(itemTypeId) || - itemTypeId == arrow::Type::HALF_FLOAT || - itemTypeId == arrow::Type::FLOAT || - itemTypeId == arrow::Type::DOUBLE || - itemTypeId == arrow::Type::STRING)) + if (IsHandledMapType(mapType)) { eType = OFTString; eSubType = OFSTJSON; @@ -842,13 +888,158 @@ OGRArrowLayer::GetGeometryTypeFromString(const std::string &osType) /* ReadList() */ /************************************************************************/ +static CPLJSONObject ReadMap(const arrow::MapArray *array, int64_t nIdxInArray); + template -static void ReadList(OGRFeature *poFeature, int i, int64_t nIdxInBatch, +static CPLJSONArray ReadList(const ArrayType *array, int64_t nIdxInArray) +{ + const auto values = std::static_pointer_cast(array->values()); + const auto nIdxStart = array->value_offset(nIdxInArray); + const int nCount = array->value_length(nIdxInArray); + CPLJSONArray oArray; + for (int k = 0; k < nCount; k++) + { + if (values->IsNull(nIdxStart + k)) + oArray.AddNull(); + else + oArray.Add(static_cast(values->Value(nIdxStart + k))); + } + return oArray; +} + +template +static CPLJSONArray ReadList(const ArrayType *array, int64_t nIdxInArray) +{ + switch (array->value_type()->id()) + { + case arrow::Type::BOOL: + { + return ReadList(array, nIdxInArray); + } + case arrow::Type::UINT8: + 
{ + return ReadList(array, nIdxInArray); + } + case arrow::Type::INT8: + { + return ReadList(array, nIdxInArray); + } + case arrow::Type::UINT16: + { + return ReadList(array, nIdxInArray); + } + case arrow::Type::INT16: + { + return ReadList(array, nIdxInArray); + } + case arrow::Type::INT32: + { + return ReadList(array, nIdxInArray); + } + case arrow::Type::UINT32: + { + return ReadList(array, nIdxInArray); + } + case arrow::Type::INT64: + { + return ReadList(array, nIdxInArray); + } + case arrow::Type::UINT64: + { + return ReadList(array, nIdxInArray); + } + case arrow::Type::HALF_FLOAT: + { + return ReadList(array, nIdxInArray); + } + case arrow::Type::FLOAT: + { + return ReadList(array, nIdxInArray); + } + case arrow::Type::DOUBLE: + { + return ReadList(array, nIdxInArray); + } + case arrow::Type::STRING: + { + CPLJSONArray oArray; + const auto values = + std::static_pointer_cast(array->values()); + const auto nIdxStart = array->value_offset(nIdxInArray); + const int nCount = array->value_length(nIdxInArray); + for (int k = 0; k < nCount; k++) + { + if (values->IsNull(nIdxStart + k)) + { + oArray.AddNull(); + } + else + { + oArray.Add(values->GetString(nIdxStart + k)); + } + } + return oArray; + } + + case arrow::Type::LIST: + { + CPLJSONArray oArray; + const auto values = + std::static_pointer_cast(array->values()); + const auto nIdxStart = array->value_offset(nIdxInArray); + const int nCount = array->value_length(nIdxInArray); + for (int k = 0; k < nCount; k++) + { + if (values->IsNull(nIdxStart + k)) + { + oArray.AddNull(); + } + else + { + oArray.Add(ReadList(values.get(), nIdxStart + k)); + } + } + return oArray; + } + + case arrow::Type::MAP: + { + CPLJSONArray oArray; + const auto values = + std::static_pointer_cast(array->values()); + const auto nIdxStart = array->value_offset(nIdxInArray); + const int nCount = array->value_length(nIdxInArray); + for (int k = 0; k < nCount; k++) + { + if (values->IsNull(nIdxStart + k)) + { + oArray.AddNull(); + } + else 
+ { + oArray.Add(ReadMap(values.get(), nIdxStart + k)); + } + } + return oArray; + } + + default: + { + CPLDebug("ARROW", "ReadList(): unexpected data type %s", + array->values()->type()->ToString().c_str()); + break; + } + } + return CPLJSONArray(); +} + +template +static void ReadList(OGRFeature *poFeature, int i, int64_t nIdxInArray, const ArrayType *array) { const auto values = std::static_pointer_cast(array->values()); - const auto nIdxStart = array->value_offset(nIdxInBatch); - const int nCount = array->value_length(nIdxInBatch); + const auto nIdxStart = array->value_offset(nIdxInArray); + const int nCount = array->value_length(nIdxInArray); std::vector aValues; aValues.reserve(nCount); for (int k = 0; k < nCount; k++) @@ -859,13 +1050,13 @@ static void ReadList(OGRFeature *poFeature, int i, int64_t nIdxInBatch, } template -static void ReadListDouble(OGRFeature *poFeature, int i, int64_t nIdxInBatch, +static void ReadListDouble(OGRFeature *poFeature, int i, int64_t nIdxInArray, const ArrayType *array) { const auto values = std::static_pointer_cast(array->values()); const auto rawValues = values->raw_values(); - const auto nIdxStart = array->value_offset(nIdxInBatch); - const int nCount = array->value_length(nIdxInBatch); + const auto nIdxStart = array->value_offset(nIdxInArray); + const int nCount = array->value_length(nIdxInArray); std::vector aValues; aValues.reserve(nCount); for (int k = 0; k < nCount; k++) @@ -879,74 +1070,74 @@ static void ReadListDouble(OGRFeature *poFeature, int i, int64_t nIdxInBatch, } template -static void ReadList(OGRFeature *poFeature, int i, int64_t nIdxInBatch, +static void ReadList(OGRFeature *poFeature, int i, int64_t nIdxInArray, const ArrayType *array, arrow::Type::type valueTypeId) { switch (valueTypeId) { case arrow::Type::BOOL: { - ReadList(poFeature, i, nIdxInBatch, + ReadList(poFeature, i, nIdxInArray, array); break; } case arrow::Type::UINT8: { - ReadList(poFeature, i, nIdxInBatch, array); + ReadList(poFeature, i, 
nIdxInArray, array); break; } case arrow::Type::INT8: { - ReadList(poFeature, i, nIdxInBatch, array); + ReadList(poFeature, i, nIdxInArray, array); break; } case arrow::Type::UINT16: { - ReadList(poFeature, i, nIdxInBatch, array); + ReadList(poFeature, i, nIdxInArray, array); break; } case arrow::Type::INT16: { - ReadList(poFeature, i, nIdxInBatch, array); + ReadList(poFeature, i, nIdxInArray, array); break; } case arrow::Type::INT32: { - ReadList(poFeature, i, nIdxInBatch, array); + ReadList(poFeature, i, nIdxInArray, array); break; } case arrow::Type::UINT32: { - ReadList(poFeature, i, nIdxInBatch, + ReadList(poFeature, i, nIdxInArray, array); break; } case arrow::Type::INT64: { - ReadList(poFeature, i, nIdxInBatch, + ReadList(poFeature, i, nIdxInArray, array); break; } case arrow::Type::UINT64: { - ReadList(poFeature, i, nIdxInBatch, + ReadList(poFeature, i, nIdxInArray, array); break; } case arrow::Type::HALF_FLOAT: { - ReadListDouble(poFeature, i, nIdxInBatch, + ReadListDouble(poFeature, i, nIdxInArray, array); break; } case arrow::Type::FLOAT: { - ReadListDouble(poFeature, i, nIdxInBatch, array); + ReadListDouble(poFeature, i, nIdxInArray, array); break; } case arrow::Type::DOUBLE: { - ReadListDouble(poFeature, i, nIdxInBatch, + ReadListDouble(poFeature, i, nIdxInArray, array); break; } @@ -954,8 +1145,8 @@ static void ReadList(OGRFeature *poFeature, int i, int64_t nIdxInBatch, { const auto values = std::static_pointer_cast(array->values()); - const auto nIdxStart = array->value_offset(nIdxInBatch); - const int nCount = array->value_length(nIdxInBatch); + const auto nIdxStart = array->value_offset(nIdxInArray); + const int nCount = array->value_length(nIdxInArray); CPLStringList aosList; for (int k = 0; k < nCount; k++) { @@ -969,8 +1160,22 @@ static void ReadList(OGRFeature *poFeature, int i, int64_t nIdxInBatch, break; } + case arrow::Type::LIST: + case arrow::Type::MAP: + { + poFeature->SetField(i, + ReadList(array, nIdxInArray) + 
.Format(CPLJSONObject::PrettyFormat::Plain) + .c_str()); + break; + } + default: + { + CPLDebug("ARROW", "ReadList(): unexpected data type %s", + array->values()->type()->ToString().c_str()); break; + } } } @@ -979,14 +1184,13 @@ static void ReadList(OGRFeature *poFeature, int i, int64_t nIdxInBatch, /************************************************************************/ template -static void ReadMap(OGRFeature *poFeature, int i, int64_t nIdxInBatch, - const arrow::MapArray *array) +static CPLJSONObject ReadMap(const arrow::MapArray *array, int64_t nIdxInArray) { const auto keys = std::static_pointer_cast(array->keys()); const auto values = std::static_pointer_cast(array->items()); - const auto nIdxStart = array->value_offset(nIdxInBatch); - const int nCount = array->value_length(nIdxInBatch); + const auto nIdxStart = array->value_offset(nIdxInArray); + const int nCount = array->value_length(nIdxInArray); CPLJSONObject oRoot; for (int k = 0; k < nCount; k++) { @@ -1000,8 +1204,133 @@ static void ReadMap(OGRFeature *poFeature, int i, int64_t nIdxInBatch, oRoot.AddNull(osKey); } } - poFeature->SetField( - i, oRoot.Format(CPLJSONObject::PrettyFormat::Plain).c_str()); + return oRoot; +} + +static CPLJSONObject ReadMap(const arrow::MapArray *array, int64_t nIdxInArray) +{ + const auto mapType = + static_cast(array->data()->type.get()); + const auto itemTypeId = mapType->item_type()->id(); + if (mapType->key_type()->id() == arrow::Type::STRING) + { + if (itemTypeId == arrow::Type::BOOL) + { + return ReadMap(array, nIdxInArray); + } + else if (itemTypeId == arrow::Type::UINT8) + { + return ReadMap(array, nIdxInArray); + } + else if (itemTypeId == arrow::Type::INT8) + { + return ReadMap(array, nIdxInArray); + } + else if (itemTypeId == arrow::Type::UINT16) + { + return ReadMap(array, nIdxInArray); + } + else if (itemTypeId == arrow::Type::INT16) + { + return ReadMap(array, nIdxInArray); + } + else if (itemTypeId == arrow::Type::UINT32) + { + return ReadMap(array, 
nIdxInArray); + } + else if (itemTypeId == arrow::Type::INT32) + { + return ReadMap(array, nIdxInArray); + } + else if (itemTypeId == arrow::Type::UINT64) + { + return ReadMap(array, nIdxInArray); + } + else if (itemTypeId == arrow::Type::INT64) + { + return ReadMap(array, nIdxInArray); + } + else if (itemTypeId == arrow::Type::FLOAT) + { + return ReadMap(array, nIdxInArray); + } + else if (itemTypeId == arrow::Type::DOUBLE) + { + return ReadMap(array, nIdxInArray); + } + else if (itemTypeId == arrow::Type::STRING) + { + const auto keys = + std::static_pointer_cast(array->keys()); + const auto values = + std::static_pointer_cast(array->items()); + const auto nIdxStart = array->value_offset(nIdxInArray); + const int nCount = array->value_length(nIdxInArray); + CPLJSONObject oRoot; + for (int k = 0; k < nCount; k++) + { + if (!keys->IsNull(nIdxStart + k)) + { + const auto osKey = keys->GetString(nIdxStart + k); + if (!values->IsNull(nIdxStart + k)) + oRoot.Add(osKey, values->GetString(nIdxStart + k)); + else + oRoot.AddNull(osKey); + } + } + return oRoot; + } + else if (itemTypeId == arrow::Type::LIST) + { + const auto keys = + std::static_pointer_cast(array->keys()); + const auto values = + std::static_pointer_cast(array->items()); + const auto nIdxStart = array->value_offset(nIdxInArray); + const int nCount = array->value_length(nIdxInArray); + CPLJSONObject oRoot; + for (int k = 0; k < nCount; k++) + { + if (!keys->IsNull(nIdxStart + k)) + { + const auto osKey = keys->GetString(nIdxStart + k); + if (!values->IsNull(nIdxStart + k)) + oRoot.Add(osKey, ReadList(values.get(), nIdxStart + k)); + else + oRoot.AddNull(osKey); + } + } + return oRoot; + } + else if (itemTypeId == arrow::Type::MAP) + { + const auto keys = + std::static_pointer_cast(array->keys()); + const auto values = + std::static_pointer_cast(array->items()); + const auto nIdxStart = array->value_offset(nIdxInArray); + const int nCount = array->value_length(nIdxInArray); + CPLJSONObject oRoot; + for (int 
k = 0; k < nCount; k++) + { + if (!keys->IsNull(nIdxStart + k)) + { + const auto osKey = keys->GetString(nIdxStart + k); + if (!values->IsNull(nIdxStart + k)) + oRoot.Add(osKey, ReadMap(values.get(), nIdxStart + k)); + else + oRoot.AddNull(osKey); + } + } + return oRoot; + } + else + { + CPLDebug("ARROW", "ReadMap(): unexpected data type %s", + array->items()->type()->ToString().c_str()); + } + } + return CPLJSONObject(); } /************************************************************************/ @@ -1502,97 +1831,10 @@ inline OGRFeature *OGRArrowLayer::ReadFeature( { const auto castArray = static_cast(array); - const auto mapType = static_cast( - array->data()->type.get()); - const auto itemTypeId = mapType->item_type()->id(); - if (mapType->key_type()->id() == arrow::Type::STRING) - { - if (itemTypeId == arrow::Type::BOOL) - { - ReadMap( - poFeature, i, nIdxInBatch, castArray); - } - else if (itemTypeId == arrow::Type::UINT8) - { - ReadMap(poFeature, i, - nIdxInBatch, castArray); - } - else if (itemTypeId == arrow::Type::INT8) - { - ReadMap(poFeature, i, - nIdxInBatch, castArray); - } - else if (itemTypeId == arrow::Type::UINT16) - { - ReadMap( - poFeature, i, nIdxInBatch, castArray); - } - else if (itemTypeId == arrow::Type::INT16) - { - ReadMap(poFeature, i, - nIdxInBatch, castArray); - } - else if (itemTypeId == arrow::Type::UINT32) - { - ReadMap( - poFeature, i, nIdxInBatch, castArray); - } - else if (itemTypeId == arrow::Type::INT32) - { - ReadMap(poFeature, i, - nIdxInBatch, castArray); - } - else if (itemTypeId == arrow::Type::UINT64) - { - ReadMap( - poFeature, i, nIdxInBatch, castArray); - } - else if (itemTypeId == arrow::Type::INT64) - { - ReadMap( - poFeature, i, nIdxInBatch, castArray); - } - else if (itemTypeId == arrow::Type::FLOAT) - { - ReadMap( - poFeature, i, nIdxInBatch, castArray); - } - else if (itemTypeId == arrow::Type::DOUBLE) - { - ReadMap( - poFeature, i, nIdxInBatch, castArray); - } - else if (itemTypeId == arrow::Type::STRING) - { - 
const auto keys = - std::static_pointer_cast( - castArray->keys()); - const auto values = - std::static_pointer_cast( - castArray->items()); - const auto nIdxStart = - castArray->value_offset(nIdxInBatch); - const int nCount = castArray->value_length(nIdxInBatch); - CPLJSONDocument oDoc; - auto oRoot = oDoc.GetRoot(); - for (int k = 0; k < nCount; k++) - { - if (!keys->IsNull(nIdxStart + k)) - { - const auto osKey = - keys->GetString(nIdxStart + k); - if (!values->IsNull(nIdxStart + k)) - oRoot.Add(osKey, - values->GetString(nIdxStart + k)); - else - oRoot.AddNull(osKey); - } - } - poFeature->SetField( - i, oRoot.Format(CPLJSONObject::PrettyFormat::Plain) - .c_str()); - } - } + poFeature->SetField( + i, ReadMap(castArray, nIdxInBatch) + .Format(CPLJSONObject::PrettyFormat::Plain) + .c_str()); break; }