diff --git a/benches/get_path.rs b/benches/get_path.rs index 8bfc8b0..df040d0 100644 --- a/benches/get_path.rs +++ b/benches/get_path.rs @@ -78,6 +78,11 @@ fn add_benchmark(c: &mut Criterion) { paths: vec!["search_metadata", "max_id_str"], expected: "505874924095815681", }, + TestSuite { + file: "status_string_values", + paths: vec!["status", "status_fSw0bx7gb5GpJHrw2JDN_kgke"], + expected: "sViysVYYl35oR", + }, ]; for test_suite in test_suites { diff --git a/data/status_string_values.json b/data/status_string_values.json new file mode 100644 index 0000000..35e01f0 --- /dev/null +++ b/data/status_string_values.json @@ -0,0 +1,204 @@ +{ + "status": { + "status_3HZSdm_HCqlgO17uMXbEit": "5k5QXd3B3v4XwnVif", + "status_5fcW9TvCcy538mix7": "BCDr7xfKWBHIaS0OmN", + "status_l1fphOQ": "RoArORV4dh1FDR", + "status_8MpE6JXKg6lTxa1zJoF_p": "z3ditT", + "status_aWQbZxgdKNY1CVDC": "rzJXHwKBGi", + "status_TGJYJZ": "uE3j", + "status_8BOCdED3V0aZ4zN5P8cBk_1X": "WruMlfsQ3hxFCsf", + "status_Dd1B4WCqs": "xOp4SC4Vl4", + "status_Lb0i8abFiC": "FbUCHBk9PML1oE", + "status_tOMVs": "xKaeSIZmA8hROdVCTi", + "status_1bVsZjdNjC": "isCgD", + "status_PE": "4QU94yEkWw0", + "status_EDpamPvwOrQmoP": "sQry7fys", + "status_YtT8QrTSWsQHD77kNc1nFwcA": "nzFTwB7Dfo08ykm", + "status_aZ4ca": "MN1qjd9tWmbX", + "status_cb05ZpeB_O": "5swM8k", + "status_WgjNrD_qD": "nbnB9FYrxtWgJ6FEghY3", + "status_WYf6nseJoWORI": "8BGSXwoEI", + "status_fLsGMbcj6ynP": "LLB2HsZO98Rg", + "status_FioVA4AEU": "kiKxXyVkEcaEoJ", + "status_vwHZaJFBBp_kG": "nT7Un4Yox9mZHL", + "status_tEeS_V6FQghtdDD2jDd6oJ4Al": "A9vMXHSzAI5x", + "status_sDOOJbshKrb73kyV": "TUc4c", + "status_YfGgR0ZuM": "IYafnX9lK9Gu", + "status_tLjPLDSiS4iu5W73XFp": "tkZxxa1HyAOv7iEN9WiZ", + "status_WO5jueYaG": "nnHDjC6f7iF3k8", + "status_ioX6c1wHI1pM5dCTyGBp": "Iax7QAC", + "status_M": "GBeTTbFb85HF5dIJdR1C", + "status_1tVPhhKc46j2hh88I": "0kM0YoaZoUjKBl", + "status_x08IF2gyUMYzvqu5JtGz1": "VogQ9nJO", + "status_Sp5LeH8E7N": "tCDzGKLKrA2egO8R4F8", + "status_4ZQZ": "tyfYA1AFGNO3", + "status_Aa7KMeiD": "lyJKa", + "status_w4XhsO3YWH9YStDvEa0": "KQ2RqygXSdkD", + "status_fcPqhSVWCYa7qDgUfGmq4W": "2ez", + "status_GOA_KGCf_tZlSgkYgdyG": "2cKf6K", + "status_Moqkzu47WvIsPtZTuX2543RMA": "pQRgfN3H6WQx0rf", + "status_nh": "Zc1TKJtOVruoiLiC2blX", + "status_dR0v": "iGgC4LhcLngK5", + "status_FBMYF3BYN6gcLBJVsgoByAsK": "A9ZUNTGRC3", + "status_ytxYN2o": "cuRyFvsH0l7FGkvgl", + "status_W2": "2ibL7CgfIaC", + "status__rjsqTubCnX7NGpkwsC13tCr": "XRUW", + "status_rpfyTt3Y1WzuJSMTQU7sMERCR": "N60y8lAWqrArVw8Fj", + "status_cLISD": "Esc7OI6exYvwp6HWtk", + "status_51kCCq8uiRAxfQu3HEsGthmi7": "3iQEst", + "status_9nD3ZctQSw1N_": "JawMrw4n", + "status_kqjO_yWldzM": "iMa", + "status_omcxEpNnq61": "JGurWsknN", + "status_Hp": "2rbBD7yEy3GTFsO", + "status_9J6yirDqVkE_MDhX": "UHL4Mn5dOXtroasyV", + "status_NzeC_NVv": "3SJGrhJpUndBP", + "status_wIWyVUNeE1YSX3pgo1uo": "hAtlXWKIV5", + "status_IbbxTijnV368WtpzZs": "qfuy", + "status_0Gza2YVAVnToW7XkVcugKO72r": "ZbWO66Fq0yx0", + "status_3lW76Q0IfY": "78OH2SM31XEkVf", + "status_3ZMvx8bEN": "Gl0MfhW8qqRGT3dqcL7", + "status_UGWw2": "ke4n", + "status_EiSQ8NjiFNarCDwKwM": "PjIXxeSIKlNsFdqu", + "status_3ugc0pLG9HR0D": "EaKxtD", + "status__eGLxg87E_h1Tqgdexp": "Is3F09ljTSSZAguV2", + "status_uz29v7kqNe": "W6B78O5fls5f", + "status_6dh3Z0kioxiQRGWzd7W0ew": "z33lAwSI72eaQTAZF", + "status_gvBTR5R0ykYrmQJ": "Sv10tVzMaRxN1C1", + "status_I1t9E8tr6id1HFP0qbC": "AMV2M1mrd0EtlEOBq", + "status_leRrBmKPU3oVu": "1stNjGH3kWKsHfWQvO", + "status_Q_nB8eGgRdwntHA_UwypzaKd": "N4Q", + "status_cZcAaaJCg_W8d8": "hSDmIRwvWhJoRLy1MUR", + "status_xmf0PkIHt2jn_YFO": "QDdGylR", + "status_yiFRUrtgsXx0t9m7KIELGQAD": "MJRSwn0mNTYYDHMKe", + "status_rUVp5wWP_WfdLyz4qbhe": "ow0Ajot9MnYrK", + "status_HmvvLtwuVo3ruyB": "gMwfypeFO7U3v", + "status_5jNHtPD": "bMlaS1QTjn73d", + "status_OHYw_fTTm": "6dw6qLF6BBy", + "status_Y4UCvy__p3pngcL": "vEUa", + "status_t0": "glMqEim", + "status__iRG1C9QfaslOjzj63": "hgaqBy", + "status_g5SXpYp2Ez": "qOHGTz6TShQpyyjA", + "status_2UEyS5PFQG3daIMTvln7yIqm": "9niWjOXip3tj", + "status_0T_Mr": "SW02SsJ3jU", + "status_VVXr1": "xQklIuLczdpC", + "status_JmAZ9jGd5V48v8AhJED994Q": "8vB48VHJCQPbtedbb", + "status_SgtOMeXNB_70POD7zAw": "dmJVnbTKnYJn", + "status_m_esobKOdG8Qpe1qG": "9964J6", + "status_KeU8CIeJ": "8iKcen2GmNMbP2", + "status_Gi6_TVmSJaaI0tgMOfFov": "KfJh", + "status_XkNijJkUBPtCM": "2wByq4TV", + "status_ceYgK57J": "fAbctKkjkIjT0wGFLQQ", + "status_8JFf_InKIuqSaTL36tWSLI": "tRCmSIhHOOxnuAo", + "status_nIiL": "0IMqglY90IKR4W", + "status_iFaII": "chPVV", + "status_9twS72K8": "l97PXFyoRU", + "status__": "diSNfDE1hmBJZZ9WtRuV", + "status_y5ETMK_rm7pQeQWUSpE": "G0RQta", + "status_F52rLU": "MnMV2wCGu", + "status_SHO6Zy8eZ": "OtH4slWqIno", + "status_XaIRDVkrScMZlqAOsnryZC": "R4Qne", + "status_OX7ZRxZ": "X24vKrWq36nuXp73", + "status_Ycu3Th2XvQ": "2uBStl", + "status_Og0sZ": "JpSCB", + "status_oeAkxKo7b": "vo4G4pUviMto", + "status_4QX5v": "3DfgeznZxw", + "status_jslApqI5lZPkrQylNmZ2WH": "ft4nQJDtNhe3k0UE8rF", + "status_F_Y8IqfqtzIUbznGXVmfyx": "uX5nKBepuIGHtnFtj", + "status_CW8hpa07b60BHE": "fO2Pvv", + "status_mHyyByIKRiaclZsIUTOS6myt": "MvFMf0mdoiI72rm", + "status_mssjHx": "Gx3P9wy5", + "status_aOvN4NFZIHA5wynF4Ptm": "lbiRSRrgBJGNr", + "status_RutnPRQLgnfrbEW": "tsmNxe", + "status_fSw0bx7gb5GpJHrw2JDN_kgke": "sViysVYYl35oR", + "status_P0HeOOq4srIfSXQYz": "Lua5pZn3fKoL6PjNl", + "status_RYyu6Kuct": "3RXX38SLBWglmRq8", + "status_F8siV33gXZC5pIrZ1ap": "ODe2M", + "status_TNHBCO": "2vVpud2RPlA3tAu", + "status_dOkzz6pGtbNUCaSf_U1bcU": "UQIgR4kz", + "status_o7": "vBkIglZ5BGJfUNir", + "status_mfOp6": "yRqcwgaIdrzH7VKd", + "status_99p1x": "hbquEjsMCYoZ1wQxKC", + "status_rp14oyA_UsNG_78sIFlS_Ml2G": "cTKNH49okNrEJF2YVrt", + "status_i6dkd77U": "1vEMXh6MmHde9", + "status_1QzNyG0Hjd4b4S": "ia6MxKarfY0D", + "status_RWLlL": "LkCvV9hadnz3f", + "status_hID7QT": "rpVz0HgDNXBr6XIb", + "status_hYmqajk": "oTsW79JGPMeVTALq", + "status_6Tftm6EsdEzvCvJLnQ": "StEeeYi7jJ0dCdnP", + "status_AMGVLAAxwHQ2geYNl7lxzNgH": "j9X7HGxwcBgNEWiz08g", + "status_TJubtfWuLFqa_NWIZB": "SJVUzjI5crZ", + "status_18MeKfdNYRV6": "PQFRFL242YGilueAcPL", + "status_OBTHfpYVqATftq0bmzzhReS": "rKvksfaYOoM9hL", + "status_YSpYbHEKVMkKWb": "WlmUkvhzRncK", + "status_G5XBbnAFPQKd": "pxl4", + "status_sQVzBPD1D2YrdyUDCaF": "nTXGzUo", + "status_VK5B0B": "lh2PAfuLyMSaGptU3em", + "status_TGjtIH4kQzLY": "WFD9G", + "status_Ff9xzH6AAUvktZQMwkMvVV": "lQHOdqcAR", + "status_OswLB249i": "sskxSuuDJV5IEtk", + "status_vVyu": "ASw58iAE2MhL2ZNdn", + "status_zXJIqrAgqKub": "JKzU6FKT1e5t8", + "status_fvS3": "0d8CDw", + "status_KtU246pGq5Llc7GrzeSW6r": "aBzm25xa8vdx5Ft", + "status_BPT3t0LTF_U2VTLqu3lzEtw": "ekotPsoIaZhR7", + "status_mloUNS2ej8bAMf": "Q9DdXwGzj16n9HCK9F", + "status_t1uRI": "0lM1nZP53Zq8hg9q", + "status_epZKJLNrcscfWQiCJpCd": "Ww36OHHBaW9h", + "status_7EPhM": "mFyo1p", + "status_99iJPHquHpo": "iSrf81", + "status_5Zcjk469HFtronDV": "KveHfZDD", + "status_zdKLWn": "Lmn", + "status__cUMGj8fJH27OXwX": "4QOP", + "status_7ogm4NMeF": "F8yoljp", + "status_QXqAJxqfIwp9x8": "AoehPiVZPtH94eM3", + "status_DBjal97iTgP": "0ghmqNbb3pFiNemkr", + "status_p5wBQ3E8iyv82gT": "qyq", + "status_9W_IqbNYwywPv5JzPCkYWGsA": "kgK", + "status_KE2myTI6dCs0BHsKbpbVPP4": "FHpVlOgNH2XQfedLffS8", + "status_alGuUu7i7KE7KGESr9aX": "qBXMDHYRG", + "status_40Zq": "K3kHgNHffP5O", + "status_tNgnfAJDoHso": "YhcwkKW3BdFaq43ATS", + "status_IcI3ikkcyF1": "GP5vpN5", + "status_hT1OgKahzxxo7a": "NxdLQZrk7", + "status_9Bm": "42jAYqDauNQYUqSr", + "status_fwPAHQQGMF": "xkQuX94fEyE", + "status_b2FkdVx7xEvsy_dyBtZL3": "MvPdLj7aTm", + "status_r8tbbp": "BZZi8m9AeOad", + "status_rE0knFLAmfi42L75": "puM2Zp7zGqyxfP5", + "status_cV": "MNL", + "status_uLGkoODizgIKQCS0": "Xrp904", + "status_gin3Z9u": "EVED", + "status_LPGgxg29oBFblIlMZu": "pCY3xt", + "status_Zsl8YFiT": "gq8ziv0iIwYAQfuZ79", + "status_AAG": "EGTX8EHDkCRLjP", + "status_fB": "UEw", + "status_sRjy386HXKXwPaD5OHyQhCK_B": "clCJjaTvFtztTx5", + "status_U_MB2IqX9l3lPWyug_OOfEyR2": "i6Y9je4i0ItCwaSD3", + "status_pd": "hhslE1zorT9mSF", + "status_fmYrcippPcTMJtXOp9T": "IYYPwhXpFUz", + "status_jrO1TvW24AagUCv": "h6GWy1", + "status_mYW0pA1ah": "5YFhzsGRejEIkQThA7", + "status_0XC63MrRKrAMHweInM": "LtZJvDp0aW24OxATYk", + "status_1ealCf7r0N8cBEEe0pEA": "lJOiDFe", + "status_s656c3pC0MMkWt13_Srd_sQ": "KNNYgivJxMhN", + "status_wyeDEmzPH26CvoPEYib1Bq": "RyJUMybugOBpa1968zW", + "status_kmaBkpj5xndSgd377vcoB": "XwJtI0Ok", + "status_OU_iZvBmRvmw7ajk": "FkcM4C1Bz3G", + "status_T4e": "ZW4qWMiyA62USk1", + "status_tfJ": "sEc0ThjrtvfJ6FS", + "status_BJ5rpwyOKR4a_65xi": "cz0bZb4QbM", + "status_7yqKUI0k1": "c6OupB", + "status_zlZYxu": "6uCZGu7qu1Yj5", + "status_GU9DGcAvz5e2iR2EY": "ZbB4CZg7RG7AeHgXue", + "status_B": "egyA", + "status_OpQdb4diuSv0y8": "kHWb", + "status_OdwFsROg": "fbXchC", + "status_Ed5kmO6N0oMEQNNgKn": "bIsVPlo7pH", + "status_5So": "9a2eHok", + "status_bYHDzU3": "0qZJK1", + "status_khtwU1CFLN": "rTNgQK0pk3Wuhk7i", + "status_Qx69ZvhDS13Iu6IFNh0eBfpM": "wDch5", + "status_N": "yQTp9UBy1dexj8pK", + "status_GCiFXdjWQwQVbUfG": "RlbVrrf4SUevJ1w" + } +} \ No newline at end of file diff --git a/rust-toolchain.toml b/rust-toolchain.toml index 73cb934..9d970ea 100644 --- a/rust-toolchain.toml +++ b/rust-toolchain.toml @@ -1,3 +1,3 @@ [toolchain] -channel = "stable" +channel = "nightly-2025-03-25" components = ["rustfmt", "clippy"] diff --git a/src/core/databend/util.rs b/src/core/databend/util.rs index affd02b..65d0854 100644 --- a/src/core/databend/util.rs +++ b/src/core/databend/util.rs @@ -12,12 +12,13 @@ // See the License for the specific language governing permissions and // limitations under the License. +use byteorder::BigEndian; +use byteorder::WriteBytesExt; use core::ops::Range; use std::borrow::Cow; use std::io::Write; - -use byteorder::BigEndian; -use byteorder::WriteBytesExt; +use std::simd::cmp::SimdPartialEq; +use std::simd::u32x8; use super::constants::*; use super::jentry::JEntry; @@ -81,16 +82,130 @@ impl<'a> RawJsonb<'a> { } } + #[inline] + #[cfg(target_arch = "x86_64")] + pub(crate) fn get_object_value_by_key_name_simd( + &self, + length: usize, + key_name: &Cow<'a, str>, + ignore_ascii_case: bool, + ) -> Result>> { + let mut item_offset = 4 + 8 * length; + + let name_bytes = key_name.as_bytes(); + let mut index = None; + + for i in 0..length / 8 { + if index.is_some() { + break; + } + index = + self.find_matched_pos_simd_x8(i, name_bytes, &mut item_offset, ignore_ascii_case)?; + } + for i in length - length % 8..length { + if index.is_some() { + break; + } + let jentry = self.read_jentry((i + 1) * 4)?; + let key_len = jentry.length as usize; + item_offset += key_len; + + if name_bytes.len() == key_len { + let key_range = Range { + start: item_offset - key_len, + end: item_offset, + }; + let data = self.slice(key_range)?; + if Self::compare(data, name_bytes, ignore_ascii_case) { + index = Some(i); + break; + } + } + } + let Some(mut index) = index else { + return Ok(None); + }; + let val_index = index; + index += 1; + // skip rest keys and values. + for i in index..(length + val_index) { + let jentry = self.read_jentry((i + 1) * 4)?; + item_offset += jentry.length as usize; + } + let jentry = self.read_jentry((length + val_index + 1) * 4)?; + let value_len = jentry.length as usize; + + let value_range = Range { + start: item_offset, + end: item_offset + value_len, + }; + let data = self.slice(value_range)?; + let value_item = jentry_to_jsonb_item(jentry, data); + Ok(Some(value_item)) + } + + #[cfg(target_arch = "x86_64")] + fn find_matched_pos_simd_x8( + &self, + group_i: usize, + target: &[u8], + offset: &mut usize, + ignore_ascii_case: bool, + ) -> Result> { + let len_x8 = self.load_jentry_lens((group_i * 8 + 1) * 4)?; + let target_len = u32x8::splat(target.len() as u32); + let mask = len_x8.simd_eq(target_len); + + for i in 0..8 { + let key_len = len_x8[i] as usize; + *offset += key_len; + + if mask.test(i) { + let key_range = Range { + start: *offset - key_len, + end: *offset, + }; + let data = self.slice(key_range)?; + if Self::compare(data, target, ignore_ascii_case) { + return Ok(Some(i + group_i * 8)); + } + } + } + Ok(None) + } + + #[cfg(target_arch = "x86_64")] + #[inline] + fn load_jentry_lens(&self, start: usize) -> Result { + let mut array = [0u32; 8]; + for (i, len) in array.iter_mut().enumerate() { + *len = self.read_u32(start + i * 4)? & JENTRY_OFF_LEN_MASK; + } + Ok(u32x8::from(array)) + } + + #[inline] + fn compare(a: &[u8], b: &[u8], ignore_ascii_case: bool) -> bool { + if ignore_ascii_case { + a.eq_ignore_ascii_case(b) + } else { + a.eq(b) + } + } + pub(crate) fn get_object_value_by_key_name( &self, key_name: &Cow<'a, str>, - eq_func: impl Fn(&[u8], &[u8]) -> bool, + ignore_ascii_case: bool, ) -> Result>> { let (header_type, header_len) = self.read_header(0)?; if header_type != OBJECT_CONTAINER_TAG || header_len == 0 { return Ok(None); } let length = header_len as usize; + + #[cfg(target_arch = "x86_64")] + return self.get_object_value_by_key_name_simd(length, key_name, ignore_ascii_case); let mut index = 0; let mut jentry_offset = 4; let mut item_offset = 4 + 8 * length; @@ -113,7 +228,7 @@ impl<'a> RawJsonb<'a> { end: item_offset, }; let key_data = self.slice(key_range)?; - if eq_func(name_bytes, key_data) { + if Self::compare(name_bytes, key_data, ignore_ascii_case) { key_matched = true; break; } diff --git a/src/functions/path.rs b/src/functions/path.rs index b8b6bae..77d688c 100644 --- a/src/functions/path.rs +++ b/src/functions/path.rs @@ -138,16 +138,12 @@ impl RawJsonb<'_> { /// ``` pub fn get_by_name(&self, name: &str, ignore_case: bool) -> Result> { let key_name = Cow::Borrowed(name); - if let Some(val_item) = - self.get_object_value_by_key_name(&key_name, |name, key| key.eq(name))? - { + if let Some(val_item) = self.get_object_value_by_key_name(&key_name, false)? { let value = OwnedJsonb::from_item(val_item)?; return Ok(Some(value)); } if ignore_case { - if let Some(val_item) = self.get_object_value_by_key_name(&key_name, |name, key| { - key.eq_ignore_ascii_case(name) - })? { + if let Some(val_item) = self.get_object_value_by_key_name(&key_name, true)? { let value = OwnedJsonb::from_item(val_item)?; return Ok(Some(value)); } @@ -258,9 +254,7 @@ impl RawJsonb<'_> { KeyPath::Index(index) => Cow::Owned(index.to_string()), KeyPath::Name(name) | KeyPath::QuotedName(name) => Cow::Borrowed(name), }; - if let Some(val_item) = - current.get_object_value_by_key_name(&name, |name, key| key.eq(name))? - { + if let Some(val_item) = current.get_object_value_by_key_name(&name, false)? { current_item = val_item; } else { return Ok(None); diff --git a/src/jsonpath/selector.rs b/src/jsonpath/selector.rs index f3f7739..9692382 100644 --- a/src/jsonpath/selector.rs +++ b/src/jsonpath/selector.rs @@ -223,9 +223,7 @@ impl<'a> Selector<'a> { }; let key_name = Cow::Borrowed(name); - if let Some(val_item) = - curr_raw_jsonb.get_object_value_by_key_name(&key_name, |name, key| key.eq(name))? - { + if let Some(val_item) = curr_raw_jsonb.get_object_value_by_key_name(&key_name, false)? { self.items.push_back(val_item); } Ok(()) diff --git a/src/lib.rs b/src/lib.rs index 03d2848..848e5ca 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -12,6 +12,8 @@ // See the License for the specific language governing permissions and // limitations under the License. +#![feature(portable_simd)] + //! `jsonb` is a binary format `JSON` representation inspired by [PostgreSQL](https://www.postgresql.org/docs/current/datatype-json.html) and [CockroachDB](https://www.cockroachlabs.com/docs/stable/jsonb). It provides a fast, lightweight and easy-to-use API for working with `JSON` data. //! //! ## Features