Skip to content

Commit

Permalink
U3b initial working
Browse files Browse the repository at this point in the history
  • Loading branch information
siara-cc committed Apr 12, 2024
1 parent f17d9b3 commit c427ffd
Show file tree
Hide file tree
Showing 4 changed files with 63 additions and 12 deletions.
21 changes: 21 additions & 0 deletions Unishox3_Alpha/test_samples.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
#!/bin/sh
# Round-trip regression check for the usx3 binary over the bundled sample texts.
#
# For every sample the script runs `usx3 -c` (.txt -> .psx), then `usx3 -d`
# (.psx -> .ds3), then `cmp`s the .ds3 against the original .txt.
# NOTE(review): -c / -d are presumably compress / decompress -- confirm
# against the usx3 usage text.
#
# Paths are relative, so the script must be started from a directory where
# ../usx3 and ../sample_texts resolve.
#
# Exit status: 0 when every sample round-trips, non-zero otherwise. Each
# failing sample is named on stderr, and a failure does not stop the run.

USX_BIN=../usx3
SAMPLES_DIR=../sample_texts

failed=0

# Built-in self test. Its exit status is deliberately not counted because the
# convention for `-t` is not visible from this script.
"$USX_BIN" -t

for sample in \
    chinese emoji french hindi japanese \
    json1 json2 json3 json4 \
    spanish tamil xml1 world95 \
    alice_wland_chn alice_wland \
    hi ja ru ta zh
do
    original="$SAMPLES_DIR/$sample.txt"
    packed="$SAMPLES_DIR/$sample.psx"
    restored="$SAMPLES_DIR/$sample.ds3"

    # Any step failing (including a cmp mismatch) marks the sample as failed.
    if ! { "$USX_BIN" -c "$original" "$packed" \
            && "$USX_BIN" -d "$packed" "$restored" \
            && cmp "$original" "$restored"; }
    then
        echo "FAILED: $sample" >&2
        failed=$((failed + 1))
    fi
done

if [ "$failed" -ne 0 ]; then
    echo "$failed sample(s) failed the round-trip check" >&2
fi

# Last command decides the script's exit status (no `exit`, so the file is
# also safe to source).
[ "$failed" -eq 0 ]
21 changes: 21 additions & 0 deletions Unishox3_Beta/test_samples.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
#!/bin/sh
# Round-trip regression check for the u3b binary over the bundled sample texts.
#
# For every sample the script runs `u3b -c` (.txt -> .psx), then `u3b -d`
# (.psx -> .ds3), then `cmp`s the .ds3 against the original .txt.
# NOTE(review): -c / -d are presumably compress / decompress -- confirm
# against the u3b usage text.
#
# Paths are relative, so the script must be started from a directory where
# ../u3b and ../sample_texts resolve.
#
# Exit status: 0 when every sample round-trips, non-zero otherwise. Each
# failing sample is named on stderr, and a failure does not stop the run.

U3B_BIN=../u3b
SAMPLES_DIR=../sample_texts

failed=0

# Built-in self test. Its exit status is deliberately not counted because the
# convention for `-t` is not visible from this script.
"$U3B_BIN" -t

for sample in \
    chinese emoji french hindi japanese \
    json1 json2 json3 json4 \
    spanish tamil xml1 world95 \
    alice_wland_chn alice_wland \
    hi ja ru ta zh
do
    original="$SAMPLES_DIR/$sample.txt"
    packed="$SAMPLES_DIR/$sample.psx"
    restored="$SAMPLES_DIR/$sample.ds3"

    # Any step failing (including a cmp mismatch) marks the sample as failed.
    if ! { "$U3B_BIN" -c "$original" "$packed" \
            && "$U3B_BIN" -d "$packed" "$restored" \
            && cmp "$original" "$restored"; }
    then
        echo "FAILED: $sample" >&2
        failed=$((failed + 1))
    fi
done

if [ "$failed" -ne 0 ]; then
    echo "$failed sample(s) failed the round-trip check" >&2
fi

# Last command decides the script's exit status (no `exit`, so the file is
# also safe to source).
[ "$failed" -eq 0 ]
29 changes: 19 additions & 10 deletions Unishox3_Beta/unishox3.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -403,27 +403,36 @@ usx3_dict_find unishox3::match_predef_dict(const char *in, int len, int l) {
int max_len_pos = -1;
int max_len_lvl = LATIN_DICT_LVL_MAX;
int pos_lvl = LATIN_DICT_LVL_MAX;
strncpy(key, in + l, key_len);
madras_dv1::dict_iter_ctx ctx[7];
for (int i = 0; i < 7; i++)
ctx[i].init(tries[i].get_max_key_len(), tries[i].get_max_level());
for (; pos_lvl >= 0; pos_lvl--) {
strncpy(key, in + l, key_len);
// printf("Key: %.*s, len: %d\n", key_len, in+l, key_len);
if (pos_lvl != LATIN_DICT_LVL_MAX) {
if (in[l] >= 'A' && in[l] <= 'Z')
key[0] += ('a' - 'A');
}
key[key_len] = 0;
//printf("key: %s\n", key);
madras_dv1::dict_lookup_ctx ctx(tries[pos_lvl].get_max_key_len());
tries[pos_lvl].find_first((uint8_t *) key, key_len, ctx);
if (max_len < ctx.last_leaf_key_len) {
max_len = ctx.last_leaf_key_len;
max_len_pos = ctx.last_leaf_node_id;
max_len_lvl = pos_lvl;
printf("Found at lvl: %d, pos: %u, key_len: %d, key: %.*s\n", pos_lvl, max_len_pos, max_len, max_len, ctx.key);
// printf("%d, %.*s\n", key_len, key_len, key);
tries[pos_lvl].find_first((uint8_t *) key, key_len, ctx[pos_lvl]);
if (ctx[pos_lvl].last_leaf_set) {
int dict_key_len = ctx[pos_lvl].key_len - ctx[pos_lvl].last_leaf_len_offset;
if (max_len < dict_key_len) {
max_len = dict_key_len;
max_len_pos = ctx[pos_lvl].last_leaf_node_id;
max_len_lvl = pos_lvl;
// printf("Found at lvl: %d, pos: %u, key_len: %d, key: %.*s\n", pos_lvl, max_len_pos, max_len, max_len, ctx[pos_lvl].key);
}
}
// ctx.close();
}
if (max_len > 0) {
pos = tries[max_len_lvl].leaf_rank(max_len_pos);
found_len = max_len;
printf("Leaf rank: %u\n", pos);
// printf("Leaf rank: %u\n", pos);
}
return usx3_dict_find(max_len_lvl, pos, found_len);
}
Expand Down Expand Up @@ -551,7 +560,7 @@ int unishox3::encode_dict_matches(const char *in, int len, int l, char *out, int
return l;
} else {
if (continuous) {
if (in[l] >= 'A' && in[l] <= 'Z')
if (in[l] >= 'A' && in[l] <= 'Z' && dict_find.lvl < 6)
SAFE_APPEND_BITS(*ol = append_bits(out, olen, *ol, 0x80, 5)); // next upper
SAFE_APPEND_BITS(*ol = append_bits(out, olen, *ol, 0x00, 2)); // end suffix and next from dictionary
//printf("`");
Expand All @@ -565,7 +574,7 @@ int unishox3::encode_dict_matches(const char *in, int len, int l, char *out, int
SAFE_APPEND_BITS(*ol = switch_to(out, olen, *ol, *state, USX_ALPHA));
*is_all_upper = 0;
}
if (in[l] >= 'A' && in[l] <= 'Z')
if (in[l] >= 'A' && in[l] <= 'Z' && dict_find.lvl < 6)
SAFE_APPEND_BITS(*ol = switch_to(out, olen, *ol, *state, USX_ALPHA));
SAFE_APPEND_BITS(*ol = switch_to(out, olen, *ol, *state, USX_PREDEF_DICT));
if (!continuous_bit_loc)
Expand Down
Loading

0 comments on commit c427ffd

Please sign in to comment.