Skip to content

Commit 327eaf4

Browse files
committed
Develop JsonSectionPageMapper in Rust API (openvinotoolkit#1224)
- Ticket no. 127135 and 127136.
- Develop `JsonSectionPageMapper` to construct page maps for the top-level sections of a given JSON file.
- Improve the performance of `DatumaroImporter.detect()` by replacing its JSON file parsing logic with `JsonSectionPageMapper`. Our existing tests validate its functionality. For the performance comparison, see the following.

- Before

```python
from datumaro.rust_api import JsonSectionPageMapper
from time import time
import datumaro as dm

start = time()
format = dm.Dataset.detect("ws_test/coco/datumaro")
dt = 1000.0 * (time() - start)
print(f"Duration for detecting Datumaro data format: {dt:.1f}ms, format={format}")
```

```console
Duration for detecting Datumaro data format: 25784.5ms, format=datumaro
```

- After

```python
from datumaro.rust_api import JsonSectionPageMapper
from time import time
import datumaro as dm

start = time()
format = dm.Dataset.detect("ws_test/coco/datumaro")
dt = 1000.0 * (time() - start)
print(f"Duration for detecting Datumaro data format: {dt:.1f}ms, format={format}")
```

```console
Duration for detecting Datumaro data format: 17234.7ms, format=datumaro
```

It saves ~8.5 secs (25784.5 ms -> 17234.7 ms).

<!-- Put an 'x' in all the boxes that apply -->
- [ ] I have added unit tests to cover my changes.
- [ ] I have added integration tests to cover my changes.
- [x] I have added the description of my changes into [CHANGELOG](https://github.com/openvinotoolkit/datumaro/blob/develop/CHANGELOG.md).
- [ ] I have updated the [documentation](https://github.com/openvinotoolkit/datumaro/tree/develop/docs) accordingly
- [x] I submit _my code changes_ under the same [MIT License](https://github.com/openvinotoolkit/datumaro/blob/develop/LICENSE) that covers the project. Feel free to contact the maintainers if that's a concern.
- [x] I have updated the license header for each file (see an example below).

```python
```

---------

Signed-off-by: Kim, Vinnam <vinnam.kim@intel.com>
1 parent 375d184 commit 327eaf4

File tree

3 files changed

+241
-4
lines changed

3 files changed

+241
-4
lines changed

rust/src/json_section_page_mapper.rs

+232
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,232 @@
1+
// Copyright (C) 2023 Intel Corporation
2+
//
3+
// SPDX-License-Identifier: MIT
4+
5+
use crate::{
6+
page_mapper::{JsonPageMapper, ParsedJsonSection},
7+
utils::read_skipping_ws,
8+
};
9+
use pyo3::{prelude::*, types::PyDict};
10+
use std::{
11+
collections::HashMap,
12+
fs::File,
13+
io::{self, BufReader, Read, Seek},
14+
path::Path,
15+
};
16+
17+
/// Location of one top-level JSON value inside the source stream.
#[derive(Debug)]
struct JsonSection {
    /// Key under which the value is stored in the top-level object.
    key: String,
    /// Byte offset of the first byte of the value.
    offset: usize,
    /// Number of bytes occupied by the value, counted from `offset`.
    size: usize,
}
23+
24+
fn handle_arr_or_dict(
25+
mut stack: Vec<u8>,
26+
mut reader: impl Read + Seek,
27+
mut last_token: u8,
28+
) -> Result<(), io::Error> {
29+
while stack.len() != 0 {
30+
match read_skipping_ws(&mut reader) {
31+
Ok(c) => match c {
32+
b'{' | b'[' => {
33+
stack.push(c);
34+
last_token = c;
35+
}
36+
b'}' => {
37+
if last_token != b'{' {
38+
let cur_pos = reader.stream_position()?;
39+
let msg = format!("Last token in the stack is '{}', but the given token at offset={} is '}}'", last_token as char, cur_pos);
40+
return Err(io::Error::new(io::ErrorKind::InvalidData, msg));
41+
}
42+
stack.pop();
43+
if stack.len() != 0 {
44+
last_token = *stack
45+
.last()
46+
.ok_or(io::Error::new(io::ErrorKind::InvalidData, "stack is empty"))?;
47+
}
48+
}
49+
b']' => {
50+
if last_token != b'[' {
51+
let cur_pos = reader.stream_position()?;
52+
let msg = format!("Last token in the stack is '{}', but the given token at offset={} is ']'", last_token as char, cur_pos);
53+
return Err(io::Error::new(io::ErrorKind::InvalidData, msg));
54+
}
55+
stack.pop();
56+
if stack.len() != 0 {
57+
last_token = *stack
58+
.last()
59+
.ok_or(io::Error::new(io::ErrorKind::InvalidData, "stack is empty"))?;
60+
}
61+
}
62+
b'"' => {
63+
while let Ok(c) = read_skipping_ws(&mut reader) {
64+
if c == b'"' {
65+
break;
66+
}
67+
}
68+
}
69+
_ => {}
70+
},
71+
Err(err) => {
72+
return Err(err);
73+
}
74+
}
75+
}
76+
Ok(())
77+
}
78+
79+
fn handle_string(mut reader: impl Read + Seek) -> Result<(), io::Error> {
80+
while let Ok(c) = read_skipping_ws(&mut reader) {
81+
if c == b'"' {
82+
break;
83+
}
84+
}
85+
Ok(())
86+
}
87+
88+
fn get_offset(mut reader: impl Read + Seek, stack: &mut Vec<u8>) -> Result<usize, io::Error> {
89+
let mut offset = usize::MAX;
90+
while let Ok(c) = read_skipping_ws(&mut reader) {
91+
stack.push(c);
92+
match c {
93+
b'{' | b'[' | b'"' => {
94+
return Ok(reader.stream_position()? as usize - 1);
95+
}
96+
b',' => {
97+
return Ok(offset - 1);
98+
}
99+
_ => {
100+
let pos = reader.stream_position()? as usize;
101+
offset = std::cmp::min(pos, offset);
102+
}
103+
}
104+
}
105+
Err(io::Error::new(
106+
io::ErrorKind::InvalidData,
107+
"Cannot get offset",
108+
))
109+
}
110+
111+
impl ParsedJsonSection for JsonSection {
112+
fn parse(buf_key: String, mut reader: impl Read + Seek) -> Result<Box<JsonSection>, io::Error> {
113+
// Move reader's cursor right after ':'
114+
while let Ok(c) = read_skipping_ws(&mut reader) {
115+
if c == b':' {
116+
break;
117+
}
118+
}
119+
120+
let mut stack = vec![];
121+
122+
let start_offset = get_offset(&mut reader, &mut stack)?;
123+
124+
let last_token = *stack
125+
.last()
126+
.ok_or(io::Error::new(io::ErrorKind::InvalidData, "stack is empty"))?;
127+
128+
let end_offset = match last_token {
129+
b'[' | b'{' => {
130+
let _ = handle_arr_or_dict(stack, &mut reader, last_token)?;
131+
Ok(reader.stream_position()? as usize)
132+
}
133+
b'"' => {
134+
let _ = handle_string(&mut reader)?;
135+
Ok(reader.stream_position()? as usize)
136+
}
137+
b',' => Ok(reader.stream_position()? as usize - 1),
138+
_ => Err(io::Error::new(io::ErrorKind::InvalidData, "s")),
139+
}?;
140+
141+
let size = end_offset - start_offset;
142+
143+
Ok(Box::new(JsonSection {
144+
key: buf_key,
145+
offset: start_offset,
146+
size: size,
147+
}))
148+
}
149+
}
150+
151+
#[derive(Debug)]
152+
pub struct JsonSectionPageMapperImpl {
153+
sections: Vec<Box<JsonSection>>,
154+
}
155+
156+
impl JsonPageMapper<JsonSection> for JsonSectionPageMapperImpl {}
157+
158+
impl JsonSectionPageMapperImpl {
159+
pub fn new(mut reader: impl Read + Seek) -> Result<Self, io::Error> {
160+
let sections = Self::parse_json(&mut reader)?;
161+
162+
Ok(JsonSectionPageMapperImpl { sections: sections })
163+
}
164+
}
165+
166+
#[pyclass]
167+
pub struct JsonSectionPageMapper {
168+
reader: BufReader<File>,
169+
mapper: JsonSectionPageMapperImpl,
170+
}
171+
172+
#[pymethods]
173+
impl JsonSectionPageMapper {
174+
#[new]
175+
fn py_new(path: String) -> PyResult<Self> {
176+
let file = File::open(Path::new(&path))?;
177+
let mut reader = BufReader::new(file);
178+
let mapper = JsonSectionPageMapperImpl::new(&mut reader)?;
179+
180+
Ok(JsonSectionPageMapper { reader, mapper })
181+
}
182+
183+
fn sections(self_: PyRef<Self>) -> PyResult<PyObject> {
184+
let dict: HashMap<&str, HashMap<&str, usize>> = self_
185+
.mapper
186+
.sections
187+
.iter()
188+
.map(|section| {
189+
let nested_dict: HashMap<&str, usize> =
190+
HashMap::from_iter([("offset", section.offset), ("size", section.size)]);
191+
(section.key.as_str(), nested_dict)
192+
})
193+
.collect();
194+
195+
Ok(dict.into_py(self_.py()))
196+
}
197+
198+
fn __len__(&self) -> PyResult<usize> {
199+
Ok(self.mapper.sections.len())
200+
}
201+
}
202+
203+
#[cfg(test)]
mod tests {
    use super::*;
    use crate::test_helpers::prepare_reader;

    /// Parses a small Datumaro-style document and checks that every reported
    /// section span is, on its own, a complete JSON value.
    #[test]
    fn test_instance() {
        const EXAMPLE: &str = r#"{"dm_format_version": "1.0", "media_type": 2, "infos": {"string": "test", "int": 0, "float": 0.0, "string_list": ["test0", "test1", "test2"], "int_list": [0, 1, 2], "float_list": [0.0, 0.1, 0.2]}, "categories": {"label": {"labels": [{"name": "cat0", "parent": "", "attributes": ["x", "y"]}, {"name": "cat1", "parent": "", "attributes": ["x", "y"]}, {"name": "cat2", "parent": "", "attributes": ["x", "y"]}, {"name": "cat3", "parent": "", "attributes": ["x", "y"]}, {"name": "cat4", "parent": "", "attributes": ["x", "y"]}], "label_groups": [], "attributes": ["a", "b", "score"]}, "mask": {"colormap": [{"label_id": 0, "r": 0, "g": 0, "b": 0}, {"label_id": 1, "r": 128, "g": 0, "b": 0}, {"label_id": 2, "r": 0, "g": 128, "b": 0}, {"label_id": 3, "r": 128, "g": 128, "b": 0}, {"label_id": 4, "r": 0, "g": 0, "b": 128}]}, "points": {"items": [{"label_id": 0, "labels": ["cat1", "cat2"], "joints": [[0, 1]]}, {"label_id": 1, "labels": ["cat1", "cat2"], "joints": [[0, 1]]}, {"label_id": 2, "labels": ["cat1", "cat2"], "joints": [[0, 1]]}, {"label_id": 3, "labels": ["cat1", "cat2"], "joints": [[0, 1]]}, {"label_id": 4, "labels": ["cat1", "cat2"], "joints": [[0, 1]]}]}}, "items": [{"id": "42", "annotations": [{"id": 900100087038, "type": "mask", "attributes": {}, "group": 900100087038, "label_id": null, "rle": {"counts": "06", "size": [2, 3]}, "z_order": 0}, {"id": 900100087038, "type": "mask", "attributes": {}, "group": 900100087038, "label_id": null, "rle": {"counts": "06", "size": [2, 3]}, "z_order": 0}], "image": {"path": "42.jpg", "size": [10, 6]}}, {"id": "43", "annotations": [], "image": {"path": "43.qq", "size": [2, 4]}}]}
        "#;

        // The first element stays bound until the end of the test, exactly as
        // before; the underscore only silences the unused-variable warning.
        let (_tempfile, mut reader) = prepare_reader(EXAMPLE);
        let json_section_page_mapper = JsonSectionPageMapperImpl::new(&mut reader).unwrap();

        println!("{:?}", json_section_page_mapper);

        let mut keys = Vec::new();

        for section in json_section_page_mapper.sections {
            let JsonSection { key, offset, size } = *section;

            reader
                .seek(io::SeekFrom::Start(offset as u64))
                .expect("Cannot seek to the section offset");

            // `read` may legally return fewer bytes than requested; `read_exact`
            // either fills the whole buffer or fails.
            let mut buf = vec![0; size];
            reader
                .read_exact(buf.as_mut_slice())
                .expect("Cannot read the whole section");

            let content: serde_json::Value = serde_json::from_str(
                std::str::from_utf8(buf.as_slice()).expect("Cannot change to utf8"),
            )
            .unwrap();
            println!("Section: {}, Content: {:?}", key, content);

            keys.push(key);
        }

        // The Datumaro importer relies on these two sections being reported.
        for expected in ["categories", "items"] {
            assert!(
                keys.iter().any(|key| key == expected),
                "section \"{}\" is missing from {:?}",
                expected,
                keys
            );
        }
    }
}

rust/src/lib.rs

+3
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@
44

55
mod coco_page_mapper;
66
mod datum_page_mapper;
7+
mod json_section_page_mapper;
78
mod page_mapper;
89
mod page_maps;
910
mod test_helpers;
@@ -12,13 +13,15 @@ use pyo3::prelude::*;
1213

1314
use crate::coco_page_mapper::CocoPageMapper;
1415
use crate::datum_page_mapper::DatumPageMapper;
16+
use crate::json_section_page_mapper::JsonSectionPageMapper;
1517

1618
/// Datumaro Rust API
1719
#[pymodule]
1820
#[pyo3(name = "rust_api")]
1921
fn rust_api(_py: Python<'_>, m: &PyModule) -> PyResult<()> {
2022
m.add_class::<CocoPageMapper>()?;
2123
m.add_class::<DatumPageMapper>()?;
24+
m.add_class::<JsonSectionPageMapper>()?;
2225

2326
Ok(())
2427
}

src/datumaro/plugins/data_formats/datumaro/importer.py

+6-4
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,7 @@
88
from datumaro.components.format_detection import FormatDetectionConfidence, FormatDetectionContext
99
from datumaro.components.importer import Importer
1010
from datumaro.components.merge.extractor_merger import ExtractorMerger
11-
from datumaro.util import parse_json
11+
from datumaro.rust_api import JsonSectionPageMapper
1212

1313
from .format import DatumaroPath
1414

@@ -28,9 +28,11 @@ def detect(
2828
with context.probe_text_file(
2929
annot_file,
3030
'must be a JSON object with "categories" ' 'and "items" keys',
31-
) as f:
32-
contents = parse_json(f.read())
33-
if not {"categories", "items"} <= contents.keys():
31+
):
32+
fpath = osp.join(context.root_path, annot_file)
33+
page_mapper = JsonSectionPageMapper(fpath)
34+
sections = page_mapper.sections()
35+
if not {"categories", "items"} <= sections.keys():
3436
raise Exception
3537

3638
@classmethod

0 commit comments

Comments
 (0)