Skip to content

Commit 97760fb

Browse files
authored
Ucdxml 17v1 (#1104)
* Initial checkin to support Unikemet * Added note about grouping Unihan attributes, and more work for Unikemet * Removed Deprecated properties, removed sections that only contained historical data, and added partial support for kEH_AltSeq * Updated version information * Renamed index to tr42 to make it easier to copy to unicode-reports * Added removed/changed support * kGB7 is a removed property * Latest Unihan, Tangut, and Nushu changes
1 parent 3182a20 commit 97760fb

37 files changed

+122712
-117769
lines changed

docs/ucdxml.md

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -64,6 +64,12 @@ We'll use [jing-trang](https://github.com/relaxng/jing-trang) in this example.
6464
1. Clone and build [jing-trang](https://github.com/relaxng/jing-trang)
6565
2. Run the following:
6666
```
67-
java -jar C:\_git\jing-trang\build\jing.jar -c UNICODETOOLS_REPO_DIR\uax\uax42\output\index.rnc <path to UAX xml file>
67+
java -jar C:\_git\jing-trang\build\jing.jar -c UNICODETOOLS_REPO_DIR\uax\uax42\output\tr42.rnc <path to UAX xml file>
6868
```
6969
Note that the UAX XML file must be saved in NFD, because the Unihan syntax regular expressions expect NFD.
70+
71+
To convert to NFD, use ICU's uconv.exe:
72+
```
73+
uconv.exe -f utf8 -t utf8 -x nfd -o {outputfile} {originalfile}
74+
```
75+

unicodetools/data/ucdxml/dev/ucd.nounihan.grouped.xml

Lines changed: 17771 additions & 18073 deletions
Large diffs are not rendered by default.

unicodetools/data/ucdxml/dev/ucd.unihan.grouped.xml

Lines changed: 103025 additions & 98691 deletions
Large diffs are not rendered by default.

unicodetools/data/ucdxml/dev/ucdxml.readme.txt

Lines changed: 3 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,19 +1,18 @@
1-
XML Representation of Unicode 16.0.0 UCD
1+
XML Representation of Unicode 17.0.0 UCD
22

33

4-
© 2024 Unicode®, Inc.
4+
© 2025 Unicode®, Inc.
55
For terms of use, see https://www.unicode.org/terms_of_use.html
66

77

8-
This directory contains the representation in XML of Version 16.0.0 of
8+
This directory contains the representation in XML of Version 17.0.0 of
99
the UCD, using the schema defined by UAX #42: Unicode Character
1010
Database in XML, at https://www.unicode.org/reports/tr42/
1111

1212
While every effort has been made to ensure consistency of the
1313
XML representation with the UCD files, there may be some errors;
1414
the UCD files are authoritative.
1515

16-
1716
There are six files, available in zip/jar format:
1817
- flat vs. grouped
1918
- no Unihan data vs. Unihan data only vs. complete UCD.

unicodetools/src/main/java/org/unicode/props/UcdProperty.java

Lines changed: 30 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -217,11 +217,36 @@ public enum UcdProperty {
217217
kEACC(PropertyType.Miscellaneous, DerivedPropertyStatus.Provisional, "cjkEACC"),
218218
kEH_Cat(PropertyType.Miscellaneous, DerivedPropertyStatus.Approved, "kEH_Cat"),
219219
kEH_Desc(PropertyType.Miscellaneous, DerivedPropertyStatus.Approved, "kEH_Desc"),
220-
kEH_FVal(PropertyType.Miscellaneous, DerivedPropertyStatus.Provisional, "kEH_FVal"),
221-
kEH_Func(PropertyType.Miscellaneous, DerivedPropertyStatus.Provisional, "kEH_Func"),
222-
kEH_HG(PropertyType.Miscellaneous, DerivedPropertyStatus.Approved, "kEH_HG"),
223-
kEH_IFAO(PropertyType.Miscellaneous, DerivedPropertyStatus.Approved, "kEH_IFAO"),
224-
kEH_JSesh(PropertyType.Miscellaneous, DerivedPropertyStatus.Approved, "kEH_JSesh"),
220+
kEH_FVal(
221+
PropertyType.Miscellaneous,
222+
DerivedPropertyStatus.Provisional,
223+
null,
224+
ValueCardinality.Unordered,
225+
"kEH_FVal"),
226+
kEH_Func(
227+
PropertyType.Miscellaneous,
228+
DerivedPropertyStatus.Provisional,
229+
null,
230+
ValueCardinality.Unordered,
231+
"kEH_Func"),
232+
kEH_HG(
233+
PropertyType.Miscellaneous,
234+
DerivedPropertyStatus.Approved,
235+
null,
236+
ValueCardinality.Unordered,
237+
"kEH_HG"),
238+
kEH_IFAO(
239+
PropertyType.Miscellaneous,
240+
DerivedPropertyStatus.Approved,
241+
null,
242+
ValueCardinality.Unordered,
243+
"kEH_IFAO"),
244+
kEH_JSesh(
245+
PropertyType.Miscellaneous,
246+
DerivedPropertyStatus.Approved,
247+
null,
248+
ValueCardinality.Unordered,
249+
"kEH_JSesh"),
225250
kEH_UniK(PropertyType.Miscellaneous, DerivedPropertyStatus.Provisional, "kEH_UniK"),
226251
kFanqie(
227252
PropertyType.Miscellaneous,

unicodetools/src/main/java/org/unicode/xml/AttributeResolver.java

Lines changed: 11 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,7 @@
2222
*/
2323
public class AttributeResolver {
2424

25+
static final String SET_SEPARATOR = "|";
2526
private final IndexUnicodeProperties indexUnicodeProperties;
2627
private final UnicodeMap<UcdPropertyValues.Age_Values> map_age;
2728
private final UnicodeMap<UcdPropertyValues.Block_Values> map_block;
@@ -147,6 +148,9 @@ public String getAttributeValue(UcdProperty prop, int codepoint) {
147148
case kOtherNumeric:
148149
case kPrimaryNumeric:
149150
case kAccountingNumeric:
151+
if (resolvedValue != null) {
152+
resolvedValue = resolvedValue.replaceAll("\\" + SET_SEPARATOR, " ");
153+
}
150154
return (resolvedValue.equals("NaN")) ? null : resolvedValue;
151155
default:
152156
return Optional.ofNullable(resolvedValue).orElse("NaN");
@@ -211,7 +215,7 @@ public String getAttributeValue(UcdProperty prop, int codepoint) {
211215
return resolvedValue;
212216
default:
213217
if (resolvedValue != null) {
214-
return resolvedValue.replaceAll("\\|", " ");
218+
return resolvedValue.replaceAll("\\" + SET_SEPARATOR, " ");
215219
}
216220
return "";
217221
}
@@ -226,7 +230,8 @@ public String getAttributeValue(UcdProperty prop, int codepoint) {
226230
return map_script.get(codepoint).getShortName();
227231
case Script_Extensions:
228232
StringBuilder extensionBuilder = new StringBuilder();
229-
String[] extensions = map_script_extensions.get(codepoint).split("\\|", 0);
233+
String[] extensions =
234+
map_script_extensions.get(codepoint).split("\\" + SET_SEPARATOR, 0);
230235
for (String extension : extensions) {
231236
extensionBuilder.append(
232237
UcdPropertyValues.Script_Values.valueOf(extension)
@@ -348,4 +353,8 @@ public boolean isUnifiedIdeograph(int codepoint) {
348353
return getAttributeValue(UcdProperty.Unified_Ideograph, codepoint).equals("Y")
349354
&& getAttributeValue(UcdProperty.Name, codepoint).equals("CJK UNIFIED IDEOGRAPH-#");
350355
}
356+
357+
public boolean isUnikemetAttributeRange(int codepoint) {
358+
return !getAttributeValue(UcdProperty.kEH_Cat, codepoint).isEmpty();
359+
}
351360
}

0 commit comments

Comments
 (0)